# blob: d77551642cd9a37a9c96b5a3e5a4008dfc2f35bb
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
import sys

from test_support import verify, verbose, TestFailed

11def test(method, input, output, *args):
12 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000013 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000014 try:
15 f = getattr(input, method)
16 value = apply(f, args)
17 except:
18 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000019 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000020 else:
21 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000022 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000023 if verbose:
24 print 'no'
25 print '*',f, `input`, `output`, `value`
26 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000027 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000028 else:
29 if verbose:
30 print 'yes'
31
# Each call is test(method, receiver, expected, *method_args).
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count() with every mix of 8-bit and Unicode receiver/argument
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

test('title', u' hello ', u' Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')

# Disabled by the ``if 0`` guard; kept for reference only.
if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')

test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
# Runs of whitespace count as one separator; the unsplit remainder keeps
# its inner spacing.  (With single spaces this case merely repeated the
# maxsplit=2 case above.)
test('split', u'a  b  c  d', [u'a', u'b', u'c  d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
# Multi-character separators, with 8-bit/Unicode receiver and separator mixed
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
# Separator at the very end yields a trailing empty string
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')


# join now works with any sequence type
class Sequence:
    """Minimal sequence wrapper exposing only __len__ and __getitem__.

    The join() tests pass instances of this class to check that join()
    accepts sequence objects other than lists and tuples.
    """

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        # Plain delegation: an out-of-range index raises whatever the
        # wrapped sequence raises.
        return self.seq[i]

93test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
Guido van Rossum15ffc712000-11-29 12:13:59 +000094test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
Guido van Rossuma831cac2000-03-10 23:23:21 +000095test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
Guido van Rossum15ffc712000-11-29 12:13:59 +000096test('join', u' ', u'w x y z', Sequence('wxyz'))
Guido van Rossuma831cac2000-03-10 23:23:21 +000097test('join', u' ', TypeError, 7)
Guido van Rossum15ffc712000-11-29 12:13:59 +000098test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
99test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
100test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
101test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
102test('join', ' ', u'w x y z', Sequence(u'wxyz'))
103test('join', ' ', TypeError, 7)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000104
105result = u''
106for i in range(10):
107 if i > 0:
108 result = result + u':'
109 result = result + u'x'*10
110test('join', u':', result, [u'x' * 10] * 10)
111test('join', u':', result, (u'x' * 10,) * 10)
112
test('strip', u' hello ', u'hello')
test('lstrip', u' hello ', u'hello ')
test('rstrip', u' hello ', u' hello')
test('strip', u'hello', u'hello')

test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')

# Disabled by the ``if 0`` guard.
# NOTE(review): this code uses ``string`` and ``transtable``; ``string`` is
# not imported by this file, so the import must be added before re-enabling.
if 0:
    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

    table = string.maketrans('a', u'A')
    test('translate', u'abc', u'Abc', table)
    test('translate', u'xyz', u'xyz', table)

# replace(old, new[, maxcount])
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

# startswith(prefix[, start[, end]])
test('startswith', u'hello', 1, u'he')
test('startswith', u'hello', 1, u'hello')
test('startswith', u'hello', 0, u'hello world')
test('startswith', u'hello', 1, u'')
test('startswith', u'hello', 0, u'ello')
test('startswith', u'hello', 1, u'ello', 1)
test('startswith', u'hello', 1, u'o', 4)
test('startswith', u'hello', 0, u'o', 5)
test('startswith', u'hello', 1, u'', 5)
test('startswith', u'hello', 0, u'lo', 6)
test('startswith', u'helloworld', 1, u'lowo', 3)
test('startswith', u'helloworld', 1, u'lowo', 3, 7)
test('startswith', u'helloworld', 0, u'lowo', 3, 6)

# endswith(suffix[, start[, end]])
test('endswith', u'hello', 1, u'lo')
test('endswith', u'hello', 0, u'he')
test('endswith', u'hello', 1, u'')
test('endswith', u'hello', 0, u'hello world')
test('endswith', u'helloworld', 0, u'worl')
test('endswith', u'helloworld', 1, u'worl', 3, 9)
test('endswith', u'helloworld', 1, u'world', 3, 12)
test('endswith', u'helloworld', 1, u'lowo', 1, 7)
test('endswith', u'helloworld', 1, u'lowo', 2, 7)
test('endswith', u'helloworld', 1, u'lowo', 3, 7)
test('endswith', u'helloworld', 0, u'lowo', 4, 7)
test('endswith', u'helloworld', 0, u'lowo', 3, 8)
test('endswith', u'ab', 0, u'ab', 0, 1)
test('endswith', u'ab', 0, u'ab', 0, 0)

# Expected values spell out the pad widths explicitly: a tab advances to the
# next multiple of the tab size, and the column restarts after '\r' or '\n'.
# Tab size 8 (default): 'ab' + 6 spaces, 'g' + 7 spaces.
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi')
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi', 8)
# Tab size 4: 'ab' + 2 spaces, 'g' + 3 spaces.
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi',
     u'abc\r\nab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)

# Disabled by the ``if 0`` guard.
if 0:
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')

176# Comparisons:
177print 'Testing Unicode comparisons...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000178verify(u'abc' == 'abc')
179verify('abc' == u'abc')
180verify(u'abc' == u'abc')
181verify(u'abcd' > 'abc')
182verify('abcd' > u'abc')
183verify(u'abcd' > u'abc')
184verify(u'abc' < 'abcd')
185verify('abc' < u'abcd')
186verify(u'abc' < u'abcd')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000187print 'done.'
188
Marc-André Lemburge5034372000-08-08 08:04:29 +0000189if 0:
190 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000191
Marc-André Lemburge5034372000-08-08 08:04:29 +0000192 print 'Testing UTF-16 code point order comparisons...',
193 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000194 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000195 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000196 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000197
Marc-André Lemburge5034372000-08-08 08:04:29 +0000198 # Non surrogate above surrogate value, fixup required
199 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000200 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000201
Marc-André Lemburge5034372000-08-08 08:04:29 +0000202 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000203 s2 = u'\ud800\udc01'
204 test_lecmp(s, s2)
205 s2 = u'\ud900\udc01'
206 test_lecmp(s, s2)
207 s2 = u'\uda00\udc01'
208 test_lecmp(s, s2)
209 s2 = u'\udb00\udc01'
210 test_lecmp(s, s2)
211 s2 = u'\ud800\udd01'
212 test_lecmp(s, s2)
213 s2 = u'\ud900\udd01'
214 test_lecmp(s, s2)
215 s2 = u'\uda00\udd01'
216 test_lecmp(s, s2)
217 s2 = u'\udb00\udd01'
218 test_lecmp(s, s2)
219 s2 = u'\ud800\ude01'
220 test_lecmp(s, s2)
221 s2 = u'\ud900\ude01'
222 test_lecmp(s, s2)
223 s2 = u'\uda00\ude01'
224 test_lecmp(s, s2)
225 s2 = u'\udb00\ude01'
226 test_lecmp(s, s2)
227 s2 = u'\ud800\udfff'
228 test_lecmp(s, s2)
229 s2 = u'\ud900\udfff'
230 test_lecmp(s, s2)
231 s2 = u'\uda00\udfff'
232 test_lecmp(s, s2)
233 s2 = u'\udb00\udfff'
234 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000235
236 test_fixup(u'\ue000')
237 test_fixup(u'\uff61')
238
239 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000240 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000241 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000242
# Pad widths are written as explicit multiplications so the number of
# spaces is unambiguous.  Width 10 leaves 7 pad characters, width 6 leaves 3;
# center() puts the odd extra space on the right in both cases.
test('ljust', u'abc', u'abc' + u' ' * 7, 10)
test('rjust', u'abc', u' ' * 7 + u'abc', 10)
test('center', u'abc', u' ' * 3 + u'abc' + u' ' * 4, 10)
test('ljust', u'abc', u'abc' + u' ' * 3, 6)
test('rjust', u'abc', u' ' * 3 + u'abc', 6)
test('center', u'abc', u' ' + u'abc' + u' ' * 2, 6)
# A width smaller than the string returns the string unchanged
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)

test('islower', u'a', 1)
test('islower', u'A', 0)
test('islower', u'\n', 0)
test('islower', u'\u1FFc', 0)
test('islower', u'abc', 1)
test('islower', u'aBc', 0)
test('islower', u'abc\n', 1)

test('isupper', u'a', 0)
test('isupper', u'A', 1)
test('isupper', u'\n', 0)
# Skipped on platforms whose name starts with 'java'
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', 0)
test('isupper', u'ABC', 1)
test('isupper', u'AbC', 0)
test('isupper', u'ABC\n', 1)

test('istitle', u'a', 0)
test('istitle', u'A', 1)
test('istitle', u'\n', 0)
test('istitle', u'\u1FFc', 1)
test('istitle', u'A Titlecased Line', 1)
test('istitle', u'A\nTitlecased Line', 1)
test('istitle', u'A Titlecased, Line', 1)
test('istitle', u'Greek \u1FFcitlecases ...', 1)
test('istitle', u'Not a capitalized String', 0)
test('istitle', u'Not\ta Titlecase String', 0)
test('istitle', u'Not--a Titlecase String', 0)

test('isalpha', u'a', 1)
test('isalpha', u'A', 1)
test('isalpha', u'\n', 0)
test('isalpha', u'\u1FFc', 1)
test('isalpha', u'abc', 1)
test('isalpha', u'aBc123', 0)
test('isalpha', u'abc\n', 0)

test('isalnum', u'a', 1)
test('isalnum', u'A', 1)
test('isalnum', u'\n', 0)
test('isalnum', u'123abc456', 1)
test('isalnum', u'a1b3c', 1)
test('isalnum', u'aBc000 ', 0)
test('isalnum', u'abc\n', 0)

test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
# A true argument keeps the line terminators in the result
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)

# translate() with an ordinal-keyed mapping: None deletes the character,
# an integer or a Unicode string replaces it
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})

Guido van Rossumd4d26842000-03-13 23:21:48 +0000310# Contains:
311print 'Testing Unicode contains method...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000312verify(('a' in u'abdb') == 1)
313verify(('a' in u'bdab') == 1)
314verify(('a' in u'bdaba') == 1)
315verify(('a' in u'bdba') == 1)
316verify(('a' in u'bdba') == 1)
317verify((u'a' in u'bdba') == 1)
318verify((u'a' in u'bdb') == 0)
319verify((u'a' in 'bdb') == 0)
320verify((u'a' in 'bdba') == 1)
321verify((u'a' in ('a',1,None)) == 1)
322verify((u'a' in (1,None,'a')) == 1)
323verify((u'a' in (1,None,u'a')) == 1)
324verify(('a' in ('a',1,None)) == 1)
325verify(('a' in (1,None,'a')) == 1)
326verify(('a' in (1,None,u'a')) == 1)
327verify(('a' in ('x',1,u'y')) == 0)
328verify(('a' in ('x',1,None)) == 0)
Guido van Rossumd4d26842000-03-13 23:21:48 +0000329print 'done.'
330
Guido van Rossuma831cac2000-03-10 23:23:21 +0000331# Formatting:
332print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000333verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
334verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
335verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
336verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
337verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
338verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
339verify(u"%c" % (u"a",) == u'a')
340verify(u"%c" % ("a",) == u'a')
341verify(u"%c" % (34,) == u'"')
342verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000343if sys.platform[:4] != 'java':
344 value = u"%r, %r" % (u"abc", "abc")
345 if value != u"u'abc', 'abc'":
346 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000347
Marc-André Lemburg36619082001-01-17 19:11:13 +0000348verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000349try:
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000350 if sys.platform[:4] != 'java':
351 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"}
352 else:
353 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000354except KeyError:
355 print '*** formatting failed for "%s"' % "u'abc, def'"
356else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000357 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000358
Guido van Rossum97064862000-04-10 13:52:48 +0000359# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000360verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
361verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
362verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
363verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
364verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
365verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
366verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
367verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
368verify('...%s...' % u"abc" == u'...abc...')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000369print 'done.'
370
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000371# Test builtin codecs
372print 'Testing builtin codecs...',
373
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000374# UTF-8 specific encoding tests:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000375verify(u'\u20ac'.encode('utf-8') == \
376 ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
377verify(u'\ud800\udc02'.encode('utf-8') == \
378 ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
379verify(u'\ud84d\udc56'.encode('utf-8') == \
380 ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000381# UTF-8 specific decoding tests
Tim Petersd2bf3b72001-01-18 02:22:22 +0000382verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
Marc-André Lemburg36619082001-01-17 19:11:13 +0000383 'utf-8') == u'\ud84d\udc56' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000384verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
Marc-André Lemburg36619082001-01-17 19:11:13 +0000385 'utf-8') == u'\ud800\udc02' )
Tim Petersd2bf3b72001-01-18 02:22:22 +0000386verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
Marc-André Lemburg36619082001-01-17 19:11:13 +0000387 'utf-8') == u'\u20ac' )
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000388
389# Other possible utf-8 test cases:
390# * strict decoding testing for all of the
391# UTF8_ERROR cases in PyUnicode_DecodeUTF8
392
393
394
Marc-André Lemburg36619082001-01-17 19:11:13 +0000395verify(unicode('hello','ascii') == u'hello')
396verify(unicode('hello','utf-8') == u'hello')
397verify(unicode('hello','utf8') == u'hello')
398verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000399
class String:
    """Object whose __str__() returns whatever is stored in ``x``.

    The checks that follow assign first an 8-bit and then a Unicode string
    to ``x`` and pass the instance to unicode() and str().
    """

    # Class-level default; tests rebind it per instance.
    x = ''

    def __str__(self):
        return self.x

405o = String()
406
407o.x = 'abc'
Marc-André Lemburg36619082001-01-17 19:11:13 +0000408verify(unicode(o) == u'abc')
409verify(str(o) == 'abc')
Marc-André Lemburgb6d78fcd2000-07-07 13:46:19 +0000410
411o.x = u'abc'
Marc-André Lemburg36619082001-01-17 19:11:13 +0000412verify(unicode(o) == u'abc')
413verify(str(o) == 'abc')
Marc-André Lemburgb6d78fcd2000-07-07 13:46:19 +0000414
Guido van Rossum97064862000-04-10 13:52:48 +0000415try:
416 u'Andr\202 x'.encode('ascii')
417 u'Andr\202 x'.encode('ascii','strict')
418except ValueError:
419 pass
420else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000421 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000422verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
423verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
Guido van Rossum97064862000-04-10 13:52:48 +0000424
425try:
426 unicode('Andr\202 x','ascii')
427 unicode('Andr\202 x','ascii','strict')
428except ValueError:
429 pass
430else:
Guido van Rossuma1374e42001-01-19 19:01:56 +0000431 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
Marc-André Lemburg36619082001-01-17 19:11:13 +0000432verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
433verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
Guido van Rossum97064862000-04-10 13:52:48 +0000434
Marc-André Lemburg36619082001-01-17 19:11:13 +0000435verify(u'hello'.encode('ascii') == 'hello')
436verify(u'hello'.encode('utf-8') == 'hello')
437verify(u'hello'.encode('utf8') == 'hello')
438verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
439verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
440verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000441
442u = u''.join(map(unichr, range(1024)))
443for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
444 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
Marc-André Lemburg36619082001-01-17 19:11:13 +0000445 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000446
447u = u''.join(map(unichr, range(256)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000448for encoding in (
449 'latin-1',
450 ):
451 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000452 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000453 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000454 print '*** codec "%s" failed round-trip' % encoding
455 except ValueError,why:
456 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000457
458u = u''.join(map(unichr, range(128)))
Guido van Rossum9e896b32000-04-05 20:11:21 +0000459for encoding in (
460 'ascii',
461 ):
462 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000463 verify(unicode(u.encode(encoding),encoding) == u)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000464 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000465 print '*** codec "%s" failed round-trip' % encoding
466 except ValueError,why:
467 print '*** codec for "%s" failed: %s' % (encoding, why)
468
469print 'done.'
470
471print 'Testing standard mapping codecs...',
472
473print '0-127...',
474s = ''.join(map(chr, range(128)))
475for encoding in (
476 'cp037', 'cp1026',
477 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
478 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000479 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000480 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
481 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
482 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
483 'mac_cyrillic', 'mac_latin2',
484
485 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
486 'cp1256', 'cp1257', 'cp1258',
487 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
488
489 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
490 'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000491
Guido van Rossum9e896b32000-04-05 20:11:21 +0000492 ### These have undefined mappings:
493 #'cp424',
Fred Drake004d5e62000-10-23 17:22:08 +0000494
Guido van Rossum9e896b32000-04-05 20:11:21 +0000495 ):
496 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000497 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000498 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000499 print '*** codec "%s" failed round-trip' % encoding
500 except ValueError,why:
501 print '*** codec for "%s" failed: %s' % (encoding, why)
502
503print '128-255...',
504s = ''.join(map(chr, range(128,256)))
505for encoding in (
506 'cp037', 'cp1026',
507 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
508 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
Fred Drake004d5e62000-10-23 17:22:08 +0000509 'cp863', 'cp865', 'cp866',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000510 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000511 'iso8859_2', 'iso8859_4', 'iso8859_5',
Marc-André Lemburga866df82001-01-03 21:29:14 +0000512 'iso8859_9', 'koi8_r', 'latin_1',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000513 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +0000514
Guido van Rossum9e896b32000-04-05 20:11:21 +0000515 ### These have undefined mappings:
516 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
517 #'cp1256', 'cp1257', 'cp1258',
518 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Tim Petersd2bf3b72001-01-18 02:22:22 +0000519 #'iso8859_3', 'iso8859_6', 'iso8859_7',
Guido van Rossum9e896b32000-04-05 20:11:21 +0000520 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +0000521
Guido van Rossum9e896b32000-04-05 20:11:21 +0000522 ### These fail the round-trip:
523 #'cp1006', 'cp875', 'iso8859_8',
Fred Drake004d5e62000-10-23 17:22:08 +0000524
Guido van Rossum9e896b32000-04-05 20:11:21 +0000525 ):
526 try:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000527 verify(unicode(s,encoding).encode(encoding) == s)
Guido van Rossuma1374e42001-01-19 19:01:56 +0000528 except TestFailed:
Guido van Rossum9e896b32000-04-05 20:11:21 +0000529 print '*** codec "%s" failed round-trip' % encoding
530 except ValueError,why:
531 print '*** codec for "%s" failed: %s' % (encoding, why)
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000532
533print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000534
535print 'Testing Unicode string concatenation...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000536verify((u"abc" u"def") == u"abcdef")
537verify(("abc" u"def") == u"abcdef")
538verify((u"abc" "def") == u"abcdef")
539verify((u"abc" u"def" "ghi") == u"abcdefghi")
540verify(("abc" "def" u"ghi") == u"abcdefghi")
Fred Drakee0243e22000-04-13 14:11:56 +0000541print 'done.'