blob: dde16ef78510b0d331e830d39da3b47b13ad0a70 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Tim Peters2f228e72001-05-13 00:19:31 +00008from test_support import verify, verbose, TestFailed
Guido van Rossuma831cac2000-03-10 23:23:21 +00009import sys
10
11def test(method, input, output, *args):
12 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000013 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000014 try:
15 f = getattr(input, method)
16 value = apply(f, args)
17 except:
18 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000019 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000020 else:
21 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000022 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000023 if verbose:
24 print 'no'
25 print '*',f, `input`, `output`, `value`
26 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000027 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000028 else:
29 if verbose:
30 print 'yes'
31
# Case conversion, counting and searching.  Each row is
# (method, receiver, expected result, *method arguments) and is handed to
# test() unchanged, in the order listed.
for case in (
    ('capitalize', u' hello ', u' hello '),
    ('capitalize', u'hello ', u'Hello '),
    ('capitalize', u'aaaa', u'Aaaa'),
    ('capitalize', u'AaAa', u'Aaaa'),

    # count() with every str/unicode pairing of receiver and argument
    ('count', u'aaa', 3, u'a'),
    ('count', u'aaa', 0, u'b'),
    ('count', 'aaa', 3, u'a'),
    ('count', 'aaa', 0, u'b'),
    ('count', u'aaa', 3, 'a'),
    ('count', u'aaa', 0, 'b'),

    ('title', u' hello ', u' Hello '),
    ('title', u'hello ', u'Hello '),
    ('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String'),
    ('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String'),
    ('title', u"getInt", u'Getint'),

    ('find', u'abcdefghiabc', 0, u'abc'),
    ('find', u'abcdefghiabc', 9, u'abc', 1),
    ('find', u'abcdefghiabc', -1, u'def', 4),

    ('rfind', u'abcdefghiabc', 9, u'abc'),

    ('lower', u'HeLLo', u'hello'),
    ('lower', u'hello', u'hello'),

    ('upper', u'HeLLo', u'HELLO'),
    ('upper', u'HELLO', u'HELLO'),
):
    test(*case)
61
62if 0:
63 transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
64
65 test('maketrans', u'abc', transtable, u'xyz')
66 test('maketrans', u'abc', ValueError, u'xyzq')
67
# split(): each row is (receiver, expected list, *split arguments).
for case in (
    # whitespace splitting without arguments
    (u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function']),
    # explicit separator, with and without a maxsplit
    (u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|'),
    (u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2),
    # separator None together with a maxsplit
    (u'a b c d', [u'a', u'b c d'], None, 1),
    (u'a b c d', [u'a', u'b', u'c d'], None, 2),
    (u'a b c d', [u'a', u'b', u'c', u'd'], None, 3),
    (u'a b c d', [u'a', u'b', u'c', u'd'], None, 4),
    (u'a b c d', [u'a b c d'], None, 0),
    (u'a b c d', [u'a', u'b', u'c d'], None, 2),
    (u'a b c d ', [u'a', u'b', u'c', u'd']),
    # multi-character separator in every str/unicode pairing; the result
    # is expected to be unicode whenever either side is unicode
    (u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//'),
    (u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//'),
    ('a//b//c//d', [u'a', u'b', u'c', u'd'], u'//'),
    # separator at the very end leaves a trailing empty string
    (u'endcase test', [u'endcase ', u''], u'test'),
    (u'endcase test', [u'endcase ', u''], 'test'),
    ('endcase test', [u'endcase ', u''], u'test'),
):
    test('split', *case)
85
Guido van Rossuma831cac2000-03-10 23:23:21 +000086
87# join now works with any sequence type
class Sequence:
    """Thin wrapper exposing only len() and indexing of another sequence.

    The join() checks in this script pass instances of it to make sure
    that join() accepts sequence types other than list and tuple.
    """

    def __init__(self, seq):
        # Keep a reference to the wrapped object; nothing is copied.
        self.seq = seq

    def __getitem__(self, i):
        # Indexing (and any IndexError) is delegated to the wrapped object.
        return self.seq[i]

    def __len__(self):
        return len(self.seq)
92
93test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +000094test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +000095test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +000096test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +000097test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +000098test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
99test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
100test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
101test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
102test('join', ' ', u'w x y z', Sequence(u'wxyz'))
103test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000104
# Build the expected text with plain concatenation, i.e. without join(),
# so join() is checked against an independently computed value: ten runs
# of ten u'x' separated by colons.
chunk = u'x' * 10
result = chunk
for i in range(1, 10):
    result = result + u':' + chunk
# The same pieces as a list and as a tuple.
test('join', u':', result, [chunk] * 10)
test('join', u':', result, (chunk,) * 10)
112
# Whitespace stripping: the same padded receiver for all three variants.
for method, expected in (
    ('strip', u'hello'),
    ('lstrip', u'hello '),
    ('rstrip', u' hello'),
):
    test(method, u' hello ', expected)
# Nothing to strip: the text comes back unchanged.
test('strip', u'hello', u'hello')

test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
119
120if 0:
121 test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
122
123 table = string.maketrans('a', u'A')
124 test('translate', u'abc', u'Abc', table)
125 test('translate', u'xyz', u'xyz', table)
126
# replace(): rows are (expected result, *replace arguments); the receiver
# is the same for every row.
for case in (
    (u'one@two!three!', u'!', u'@', 1),
    (u'onetwothree', '!', ''),
    (u'one@two@three!', u'!', u'@', 2),
    (u'one@two@three@', u'!', u'@', 3),
    (u'one@two@three@', u'!', u'@', 4),
    (u'one!two!three!', u'!', u'@', 0),
    (u'one@two@three@', u'!', u'@'),
    (u'one!two!three!', u'x', u'@'),
    (u'one!two!three!', u'x', u'@', 2),
):
    test('replace', u'one!two!three!', *case)

# startswith(): rows are (receiver, expected flag, prefix[, start[, end]]).
for case in (
    (u'hello', 1, u'he'),
    (u'hello', 1, u'hello'),
    (u'hello', 0, u'hello world'),
    (u'hello', 1, u''),
    (u'hello', 0, u'ello'),
    (u'hello', 1, u'ello', 1),
    (u'hello', 1, u'o', 4),
    (u'hello', 0, u'o', 5),
    (u'hello', 1, u'', 5),
    (u'hello', 0, u'lo', 6),
    (u'helloworld', 1, u'lowo', 3),
    (u'helloworld', 1, u'lowo', 3, 7),
    (u'helloworld', 0, u'lowo', 3, 6),
):
    test('startswith', *case)

# endswith(): rows are (receiver, expected flag, suffix[, start[, end]]).
for case in (
    (u'hello', 1, u'lo'),
    (u'hello', 0, u'he'),
    (u'hello', 1, u''),
    (u'hello', 0, u'hello world'),
    (u'helloworld', 0, u'worl'),
    (u'helloworld', 1, u'worl', 3, 9),
    (u'helloworld', 1, u'world', 3, 12),
    (u'helloworld', 1, u'lowo', 1, 7),
    (u'helloworld', 1, u'lowo', 2, 7),
    (u'helloworld', 1, u'lowo', 3, 7),
    (u'helloworld', 0, u'lowo', 4, 7),
    (u'helloworld', 0, u'lowo', 3, 8),
    (u'ab', 0, u'ab', 0, 1),
    (u'ab', 0, u'ab', 0, 0),
):
    test('endswith', *case)
165
# expandtabs(): the column counter restarts after '\r' and '\n', so the
# tab after u'ab' (column 2) and the tab after u'g' (column 1) expand to
# 6 and 7 spaces with the default tab size of 8, and to 2 and 3 spaces
# with a tab size of 4.  The padding is spelled out with multiplication so
# that the number of spaces is unambiguous.
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi')
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi',
     u'abc\r\nab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)
170
171if 0:
172 test('capwords', u'abc def ghi', u'Abc Def Ghi')
173 test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
174 test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
175
176# Comparisons:
177print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000178verify(u'abc' == 'abc')
179verify('abc' == u'abc')
180verify(u'abc' == u'abc')
181verify(u'abcd' > 'abc')
182verify('abcd' > u'abc')
183verify(u'abcd' > u'abc')
184verify(u'abc' < 'abcd')
185verify('abc' < u'abcd')
186verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000187print 'done.'
188
Marc-André Lemburge5034372000-08-08 08:04:29 +0000189if 0:
190 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000191
Marc-André Lemburge5034372000-08-08 08:04:29 +0000192 print 'Testing UTF-16 code point order comparisons...',
193 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000194 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000195 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000196 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000197
Marc-André Lemburge5034372000-08-08 08:04:29 +0000198 # Non surrogate above surrogate value, fixup required
199 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000200 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000201
Marc-André Lemburge5034372000-08-08 08:04:29 +0000202 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000203 s2 = u'\ud800\udc01'
204 test_lecmp(s, s2)
205 s2 = u'\ud900\udc01'
206 test_lecmp(s, s2)
207 s2 = u'\uda00\udc01'
208 test_lecmp(s, s2)
209 s2 = u'\udb00\udc01'
210 test_lecmp(s, s2)
211 s2 = u'\ud800\udd01'
212 test_lecmp(s, s2)
213 s2 = u'\ud900\udd01'
214 test_lecmp(s, s2)
215 s2 = u'\uda00\udd01'
216 test_lecmp(s, s2)
217 s2 = u'\udb00\udd01'
218 test_lecmp(s, s2)
219 s2 = u'\ud800\ude01'
220 test_lecmp(s, s2)
221 s2 = u'\ud900\ude01'
222 test_lecmp(s, s2)
223 s2 = u'\uda00\ude01'
224 test_lecmp(s, s2)
225 s2 = u'\udb00\ude01'
226 test_lecmp(s, s2)
227 s2 = u'\ud800\udfff'
228 test_lecmp(s, s2)
229 s2 = u'\ud900\udfff'
230 test_lecmp(s, s2)
231 s2 = u'\uda00\udfff'
232 test_lecmp(s, s2)
233 s2 = u'\udb00\udfff'
234 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000235
236 test_fixup(u'\ue000')
237 test_fixup(u'\uff61')
238
239 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000240 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000241 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000242
# ljust()/rjust()/center(): the result is padded with spaces up to the
# requested width.  The padding is written as an explicit multiplication
# so the number of spaces cannot be misread.  For center() the odd
# leftover space goes to the right-hand side for these inputs.
test('ljust', u'abc', u'abc' + u' ' * 7, 10)
test('rjust', u'abc', u' ' * 7 + u'abc', 10)
test('center', u'abc', u' ' * 3 + u'abc' + u' ' * 4, 10)
test('ljust', u'abc', u'abc' + u' ' * 3, 6)
test('rjust', u'abc', u' ' * 3 + u'abc', 6)
test('center', u'abc', u' ' + u'abc' + u' ' * 2, 6)
# A width smaller than the text leaves the text untouched.
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)
252
# Character-class predicates.  Every call is test(name, text, flag) where
# flag is 1 when the predicate is expected to hold and 0 otherwise.
for text, flag in (
    (u'a', 1), (u'A', 0), (u'\n', 0), (u'\u1FFc', 0),
    (u'abc', 1), (u'aBc', 0), (u'abc\n', 1),
):
    test('islower', text, flag)

for text, flag in ((u'a', 0), (u'A', 1), (u'\n', 0)):
    test('isupper', text, flag)
# This one case is skipped when sys.platform starts with 'java'.
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', 0)
for text, flag in ((u'ABC', 1), (u'AbC', 0), (u'ABC\n', 1)):
    test('isupper', text, flag)

for text, flag in (
    (u'a', 0),
    (u'A', 1),
    (u'\n', 0),
    (u'\u1FFc', 1),
    (u'A Titlecased Line', 1),
    (u'A\nTitlecased Line', 1),
    (u'A Titlecased, Line', 1),
    (u'Greek \u1FFcitlecases ...', 1),
    (u'Not a capitalized String', 0),
    (u'Not\ta Titlecase String', 0),
    (u'Not--a Titlecase String', 0),
):
    test('istitle', text, flag)

for text, flag in (
    (u'a', 1), (u'A', 1), (u'\n', 0), (u'\u1FFc', 1),
    (u'abc', 1), (u'aBc123', 0), (u'abc\n', 0),
):
    test('isalpha', text, flag)

for text, flag in (
    (u'a', 1), (u'A', 1), (u'\n', 0), (u'123abc456', 1),
    (u'a1b3c', 1), (u'aBc000 ', 0), (u'abc\n', 0),
):
    test('isalnum', text, flag)
297
# splitlines(): rows are (receiver, expected list, *arguments).  The last
# row passes 1 as the argument and expects the line ends to be kept.
for case in (
    (u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi']),
    (u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi']),
    (u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi']),
    (u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi']),
    (u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u'']),
    (u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u'']),
    (u"\nabc\ndef\r\nghi\n\r",
     [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1),
):
    test('splitlines', *case)

# translate() with a dict keyed by ordinal; the mapping grows by one entry
# per check: None as value, then an ordinal, then a unicode string.
mapping = {ord('a'): None}
test('translate', u"abababc", u'bbbc', mapping)
mapping[ord('b')] = ord('i')
test('translate', u"abababc", u'iiic', mapping)
mapping[ord('c')] = u'x'
test('translate', u"abababc", u'iiix', mapping)
309
Guido van Rossumd4d26842000-03-13 23:21:48 +0000310# Contains:
311print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000312verify(('a' in u'abdb') == 1)
313verify(('a' in u'bdab') == 1)
314verify(('a' in u'bdaba') == 1)
315verify(('a' in u'bdba') == 1)
316verify(('a' in u'bdba') == 1)
317verify((u'a' in u'bdba') == 1)
318verify((u'a' in u'bdb') == 0)
319verify((u'a' in 'bdb') == 0)
320verify((u'a' in 'bdba') == 1)
321verify((u'a' in ('a',1,None)) == 1)
322verify((u'a' in (1,None,'a')) == 1)
323verify((u'a' in (1,None,u'a')) == 1)
324verify(('a' in ('a',1,None)) == 1)
325verify(('a' in (1,None,'a')) == 1)
326verify(('a' in (1,None,u'a')) == 1)
327verify(('a' in ('x',1,u'y')) == 0)
328verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000329print 'done.'
330
Guido van Rossuma831cac2000-03-10 23:23:21 +0000331# Formatting:
332print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000333verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
334verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
335verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
336verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
337verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
338verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
339verify(u"%c" % (u"a",) == u'a')
340verify(u"%c" % ("a",) == u'a')
341verify(u"%c" % (34,) == u'"')
342verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000343if sys.platform[:4] != 'java':
344 value = u"%r, %r" % (u"abc", "abc")
345 if value != u"u'abc', 'abc'":
346 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000347
Marc-André Lemburg36619082001-01-17 19:11:13 +0000348verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000349try:
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000350 if sys.platform[:4] != 'java':
351 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"}
352 else:
353 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000354except KeyError:
355 print '*** formatting failed for "%s"' % "u'abc, def'"
356else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000357 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000358
Guido van Rossum97064862000-04-10 13:52:48 +0000359# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000360verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
361verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
362verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
363verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
364verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
365verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
366verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
367verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
368verify('...%s...' % u"abc" == u'...abc...')
Marc-André Lemburg542fe562001-05-02 14:21:53 +0000369verify('%*s' % (5,u'abc',) == u' abc')
370verify('%*s' % (-5,u'abc',) == u'abc ')
371verify('%*.*s' % (5,2,u'abc',) == u' ab')
372verify('%*.*s' % (5,3,u'abc',) == u' abc')
373verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
374verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000375print 'done.'
376
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000377# Test builtin codecs
378print 'Testing builtin codecs...',
379
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000380# UTF-8 specific encoding tests:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000381verify(u'\u20ac'.encode('utf-8') == \
382 ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
383verify(u'\ud800\udc02'.encode('utf-8') == \
384 ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
385verify(u'\ud84d\udc56'.encode('utf-8') == \
386 ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000387# UTF-8 specific decoding tests
Tim Petersd2bf3b72001-01-18 02:22:22 +0000388verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389 'utf-8') == u'\U00023456' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000390verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000391 'utf-8') == u'\U00010002' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000392verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
Marc-André Lemburg36619082001-01-17 19:11:13 +0000393 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000394
395# Other possible utf-8 test cases:
396# * strict decoding testing for all of the
397# UTF8_ERROR cases in PyUnicode_DecodeUTF8
398
399
400
Marc-André Lemburg36619082001-01-17 19:11:13 +0000401verify(unicode('hello','ascii') == u'hello')
402verify(unicode('hello','utf-8') == u'hello')
403verify(unicode('hello','utf8') == u'hello')
404verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000405
class String:
    """Object whose __str__ hands back whatever is stored in ``x``."""

    # Class-level default; instances override it by assigning to ``x``.
    x = ''

    def __str__(self):
        text = self.x
        return text
410
o = String()

# __str__ hands back a plain string first and a unicode string second;
# unicode() and str() must give the same text in both cases.
for text in ('abc', u'abc'):
    o.x = text
    verify(unicode(o) == u'abc')
    verify(str(o) == 'abc')
Marc-André Lemburgb6d78fcd2000-07-07 13:46:19 +0000420
Guido van Rossum97064862000-04-10 13:52:48 +0000421try:
422 u'Andr\202 x'.encode('ascii')
423 u'Andr\202 x'.encode('ascii','strict')
424except ValueError:
425 pass
426else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000427 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000428verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
429verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
Guido van Rossum97064862000-04-10 13:52:48 +0000430
431try:
432 unicode('Andr\202 x','ascii')
433 unicode('Andr\202 x','ascii','strict')
434except ValueError:
435 pass
436else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000437 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000438verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
439verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000440
# Encoding plain ASCII text: (codec name, expected bytes).
for encoding, encoded in (
    ('ascii', 'hello'),
    ('utf-8', 'hello'),
    ('utf8', 'hello'),
    ('utf-16-le', 'h\000e\000l\000l\000o\000'),
    ('utf-16-be', '\000h\000e\000l\000l\000o'),
    ('latin-1', 'hello'),
):
    verify(u'hello'.encode(encoding) == encoded)

# Roundtrip safety for BMP (just the first 1024 chars)
u = u''.join(map(unichr, range(1024)))
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    decoded = unicode(u.encode(encoding), encoding)
    verify(decoded == u)

# Roundtrip safety for non-BMP (just a few chars)
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    decoded = unicode(u.encode(encoding), encoding)
    verify(decoded == u)
461
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000462u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000463for encoding in (
464 'latin-1',
465 ):
466 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000467 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000468 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000469 print '*** codec "%s" failed round-trip' % encoding
470 except ValueError,why:
471 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000472
473u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000474for encoding in (
475 'ascii',
476 ):
477 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000478 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000479 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000480 print '*** codec "%s" failed round-trip' % encoding
481 except ValueError,why:
482 print '*** codec for "%s" failed: %s' % (encoding, why)
483
484print 'done.'
485
486print 'Testing standard mapping codecs...',
487
488print '0-127...',
489s = ''.join(map(chr, range(128)))
490for encoding in (
491 'cp037', 'cp1026',
492 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
493 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000494 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000495 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
496 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
497 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
498 'mac_cyrillic', 'mac_latin2',
499
500 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
501 'cp1256', 'cp1257', 'cp1258',
502 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
503
504 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Tim Peters2f228e72001-05-13 00:19:31 +0000505 'cp1006', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000506
Guido van Rossum9e896b32000-04-05 20:11:21 +0000507 ### These have undefined mappings:
508 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000509
Tim Peters2f228e72001-05-13 00:19:31 +0000510 ### These fail the round-trip:
511 #'cp875'
512
Guido van Rossum9e896b32000-04-05 20:11:21 +0000513 ):
514 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000515 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000516 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000517 print '*** codec "%s" failed round-trip' % encoding
518 except ValueError,why:
519 print '*** codec for "%s" failed: %s' % (encoding, why)
520
521print '128-255...',
522s = ''.join(map(chr, range(128,256)))
523for encoding in (
524 'cp037', 'cp1026',
525 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
526 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000527 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000528 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000529 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000530 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000531 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000532
Guido van Rossum9e896b32000-04-05 20:11:21 +0000533 ### These have undefined mappings:
534 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
535 #'cp1256', 'cp1257', 'cp1258',
536 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000537 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000538 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000539
Guido van Rossum9e896b32000-04-05 20:11:21 +0000540 ### These fail the round-trip:
541 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000542
Guido van Rossum9e896b32000-04-05 20:11:21 +0000543 ):
544 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000545 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000546 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000547 print '*** codec "%s" failed round-trip' % encoding
548 except ValueError,why:
549 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000550
551print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000552
553print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000554verify((u"abc" u"def") == u"abcdef")
555verify(("abc" u"def") == u"abcdef")
556verify((u"abc" "def") == u"abcdef")
557verify((u"abc" u"def" "ghi") == u"abcdefghi")
558verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000559print 'done.'