blob: d508bef7574dcdb6c164b6b14358291434a347c3 [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Tim Peters2f228e72001-05-13 00:19:31 +00008from test_support import verify, verbose, TestFailed
Guido van Rossuma831cac2000-03-10 23:23:21 +00009import sys
10
11def test(method, input, output, *args):
12 if verbose:
Guido van Rossum15ffc712000-11-29 12:13:59 +000013 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
Guido van Rossuma831cac2000-03-10 23:23:21 +000014 try:
15 f = getattr(input, method)
16 value = apply(f, args)
17 except:
18 value = sys.exc_type
Guido van Rossum66503202000-04-28 20:39:58 +000019 exc = sys.exc_info()[:2]
Guido van Rossuma831cac2000-03-10 23:23:21 +000020 else:
21 exc = None
Guido van Rossum15ffc712000-11-29 12:13:59 +000022 if value != output or type(value) is not type(output):
Guido van Rossuma831cac2000-03-10 23:23:21 +000023 if verbose:
24 print 'no'
25 print '*',f, `input`, `output`, `value`
26 if exc:
Guido van Rossum66503202000-04-28 20:39:58 +000027 print ' value == %s: %s' % (exc)
Guido van Rossuma831cac2000-03-10 23:23:21 +000028 else:
29 if verbose:
30 print 'yes'
31
# Each call below is test(method, input, expected, *method_args).

# capitalize(): a leading space leaves the string unchanged; otherwise the
# first character is upper-cased and the rest lower-cased.
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count(): exercised with every mix of str and unicode receiver/argument.
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

# title(): punctuation separates words; inner capitals are lowered.
test('title', u' hello ', u' Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

# find()/rfind(): optional third argument is the start index; -1 means
# "not found".
test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')
61
# Disabled: 'maketrans' is looked up as a method on the unicode object by
# test().  The block is kept for reference only.  transtable is a 256-byte
# table in which the entries for 'a', 'b', 'c' hold 'x', 'y', 'z'.
if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')
67
# split(): default whitespace splitting, explicit separators, and the
# maxsplit argument (None as separator selects whitespace splitting).
test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
# Multi-character separators, with str/unicode mixed between receiver and
# separator; the expected result is a list of unicode strings in every case.
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
# Separator at the very end of the string yields a trailing empty item.
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')
85
Guido van Rossuma831cac2000-03-10 23:23:21 +000086
# join now works with any sequence type
class Sequence:
    """Thin wrapper exposing only len() and indexing of the wrapped object.

    Used by the join() tests to pass something that is a sequence but is
    neither a list nor a tuple.
    """

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        return self.seq[i]
92
# join(): the receiver is the separator, the single argument is the
# sequence.  Covers lists, tuples and the Sequence wrapper, mixed
# str/unicode items, and non-string items (expected: TypeError).
test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
test('join', u' ', u'w x y z', Sequence('wxyz'))
test('join', u' ', TypeError, 7)
test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
# Same checks with a plain str separator; unicode items make the result
# unicode.
test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
test('join', ' ', u'w x y z', Sequence(u'wxyz'))
test('join', ' ', TypeError, 7)

# Build the expected value by hand: ten runs of ten 'x', colon-separated.
result = u''
for i in range(10):
    if i > 0:
        result = result + u':'
    result = result + u'x'*10
test('join', u':', result, [u'x' * 10] * 10)
test('join', u':', result, (u'x' * 10,) * 10)

test('strip', u' hello ', u'hello')
test('lstrip', u' hello ', u'hello ')
test('rstrip', u' hello ', u' hello')
test('strip', u'hello', u'hello')

test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')

# Disabled.  NOTE(review): this block uses `transtable` (assigned only
# inside another disabled block) and the `string` module, which is not
# imported in the visible part of the file -- confirm before enabling.
if 0:
    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

    table = string.maketrans('a', u'A')
    test('translate', u'abc', u'Abc', table)
    test('translate', u'xyz', u'xyz', table)

# replace(): optional third argument limits the number of replacements.
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

# startswith(prefix[, start[, end]]): expected results are the ints 1/0.
test('startswith', u'hello', 1, u'he')
test('startswith', u'hello', 1, u'hello')
test('startswith', u'hello', 0, u'hello world')
test('startswith', u'hello', 1, u'')
test('startswith', u'hello', 0, u'ello')
test('startswith', u'hello', 1, u'ello', 1)
test('startswith', u'hello', 1, u'o', 4)
test('startswith', u'hello', 0, u'o', 5)
test('startswith', u'hello', 1, u'', 5)
test('startswith', u'hello', 0, u'lo', 6)
test('startswith', u'helloworld', 1, u'lowo', 3)
test('startswith', u'helloworld', 1, u'lowo', 3, 7)
test('startswith', u'helloworld', 0, u'lowo', 3, 6)

# endswith(suffix[, start[, end]]): includes end indices past the string
# length and slices too short to hold the suffix.
test('endswith', u'hello', 1, u'lo')
test('endswith', u'hello', 0, u'he')
test('endswith', u'hello', 1, u'')
test('endswith', u'hello', 0, u'hello world')
test('endswith', u'helloworld', 0, u'worl')
test('endswith', u'helloworld', 1, u'worl', 3, 9)
test('endswith', u'helloworld', 1, u'world', 3, 12)
test('endswith', u'helloworld', 1, u'lowo', 1, 7)
test('endswith', u'helloworld', 1, u'lowo', 2, 7)
test('endswith', u'helloworld', 1, u'lowo', 3, 7)
test('endswith', u'helloworld', 0, u'lowo', 4, 7)
test('endswith', u'helloworld', 0, u'lowo', 3, 8)
test('endswith', u'ab', 0, u'ab', 0, 1)
test('endswith', u'ab', 0, u'ab', 0, 0)
165
# expandtabs(): each tab is replaced by enough spaces to reach the next
# multiple of the tab size (default 8); the column count restarts after
# '\r' or '\n'.  With tab size 8, 'ab' is followed by six spaces and 'g'
# by seven; with tab size 4, by two and three.
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
170
# Disabled: 'capwords' would be looked up as a method on the unicode
# object by test().
if 0:
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')

# Comparisons:
# ==, > and < are each checked for unicode/str, str/unicode and
# unicode/unicode operand pairs.
print 'Testing Unicode comparisons...',
verify(u'abc' == 'abc')
verify('abc' == u'abc')
verify(u'abc' == u'abc')
verify(u'abcd' > 'abc')
verify('abcd' > u'abc')
verify(u'abcd' > u'abc')
verify(u'abc' < 'abcd')
verify('abc' < u'abcd')
verify(u'abc' < u'abcd')
print 'done.'
188
Marc-André Lemburge5034372000-08-08 08:04:29 +0000189if 0:
190 # Move these tests to a Unicode collation module test...
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000191
Marc-André Lemburge5034372000-08-08 08:04:29 +0000192 print 'Testing UTF-16 code point order comparisons...',
193 #No surrogates, no fixup required.
Marc-André Lemburg36619082001-01-17 19:11:13 +0000194 verify(u'\u0061' < u'\u20ac')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000195 # Non surrogate below surrogate value, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000196 verify(u'\u0061' < u'\ud800\udc02')
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000197
Marc-André Lemburge5034372000-08-08 08:04:29 +0000198 # Non surrogate above surrogate value, fixup required
199 def test_lecmp(s, s2):
Tim Petersd2bf3b72001-01-18 02:22:22 +0000200 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000201
Marc-André Lemburge5034372000-08-08 08:04:29 +0000202 def test_fixup(s):
Fred Drake004d5e62000-10-23 17:22:08 +0000203 s2 = u'\ud800\udc01'
204 test_lecmp(s, s2)
205 s2 = u'\ud900\udc01'
206 test_lecmp(s, s2)
207 s2 = u'\uda00\udc01'
208 test_lecmp(s, s2)
209 s2 = u'\udb00\udc01'
210 test_lecmp(s, s2)
211 s2 = u'\ud800\udd01'
212 test_lecmp(s, s2)
213 s2 = u'\ud900\udd01'
214 test_lecmp(s, s2)
215 s2 = u'\uda00\udd01'
216 test_lecmp(s, s2)
217 s2 = u'\udb00\udd01'
218 test_lecmp(s, s2)
219 s2 = u'\ud800\ude01'
220 test_lecmp(s, s2)
221 s2 = u'\ud900\ude01'
222 test_lecmp(s, s2)
223 s2 = u'\uda00\ude01'
224 test_lecmp(s, s2)
225 s2 = u'\udb00\ude01'
226 test_lecmp(s, s2)
227 s2 = u'\ud800\udfff'
228 test_lecmp(s, s2)
229 s2 = u'\ud900\udfff'
230 test_lecmp(s, s2)
231 s2 = u'\uda00\udfff'
232 test_lecmp(s, s2)
233 s2 = u'\udb00\udfff'
234 test_lecmp(s, s2)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000235
236 test_fixup(u'\ue000')
237 test_fixup(u'\uff61')
238
239 # Surrogates on both sides, no fixup required
Marc-André Lemburg36619082001-01-17 19:11:13 +0000240 verify(u'\ud800\udc02' < u'\ud84d\udc56')
Marc-André Lemburge5034372000-08-08 08:04:29 +0000241 print 'done.'
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000242
# ljust/rjust/center(width): the result is padded with spaces to exactly
# `width` characters; a width smaller than the string returns it unchanged.
# For center(), when the padding cannot be split evenly the extra space
# goes on the right in both cases below (3+4 for width 10, 1+2 for width 6).
test('ljust', u'abc', u'abc       ', 10)
test('rjust', u'abc', u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
test('ljust', u'abc', u'abc   ', 6)
test('rjust', u'abc', u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)
252
# Character-class predicates.  Expected results are the ints 1/0.
# u'\u1FFc' is used as a character that is neither lower nor upper case
# but does count as title case and alphabetic in the checks below.
test('islower', u'a', 1)
test('islower', u'A', 0)
test('islower', u'\n', 0)
test('islower', u'\u1FFc', 0)
test('islower', u'abc', 1)
test('islower', u'aBc', 0)
test('islower', u'abc\n', 1)

test('isupper', u'a', 0)
test('isupper', u'A', 1)
test('isupper', u'\n', 0)
# This one check is skipped when sys.platform starts with 'java'.
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', 0)
test('isupper', u'ABC', 1)
test('isupper', u'AbC', 0)
test('isupper', u'ABC\n', 1)

test('istitle', u'a', 0)
test('istitle', u'A', 1)
test('istitle', u'\n', 0)
test('istitle', u'\u1FFc', 1)
test('istitle', u'A Titlecased Line', 1)
test('istitle', u'A\nTitlecased Line', 1)
test('istitle', u'A Titlecased, Line', 1)
test('istitle', u'Greek \u1FFcitlecases ...', 1)
test('istitle', u'Not a capitalized String', 0)
test('istitle', u'Not\ta Titlecase String', 0)
test('istitle', u'Not--a Titlecase String', 0)

test('isalpha', u'a', 1)
test('isalpha', u'A', 1)
test('isalpha', u'\n', 0)
test('isalpha', u'\u1FFc', 1)
test('isalpha', u'abc', 1)
test('isalpha', u'aBc123', 0)
test('isalpha', u'abc\n', 0)

test('isalnum', u'a', 1)
test('isalnum', u'A', 1)
test('isalnum', u'\n', 0)
test('isalnum', u'123abc456', 1)
test('isalnum', u'a1b3c', 1)
test('isalnum', u'aBc000 ', 0)
test('isalnum', u'abc\n', 0)

# splitlines(): '\n', '\r' and '\r\n' all end a line; the final call
# passes 1 as the keepends argument so the terminators stay in the items.
test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)

# translate(): the table maps ordinals to None (delete), another ordinal,
# or a unicode string.
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
309
# Contains:
# The `in` operator with str/unicode mixed on either side, first against
# strings and then against tuples that also hold non-string items.
print 'Testing Unicode contains method...',
verify(('a' in u'abdb') == 1)
verify(('a' in u'bdab') == 1)
verify(('a' in u'bdaba') == 1)
verify(('a' in u'bdba') == 1)
verify(('a' in u'bdba') == 1)
verify((u'a' in u'bdba') == 1)
verify((u'a' in u'bdb') == 0)
verify((u'a' in 'bdb') == 0)
verify((u'a' in 'bdba') == 1)
verify((u'a' in ('a',1,None)) == 1)
verify((u'a' in (1,None,'a')) == 1)
verify((u'a' in (1,None,u'a')) == 1)
verify(('a' in ('a',1,None)) == 1)
verify(('a' in (1,None,'a')) == 1)
verify(('a' in (1,None,u'a')) == 1)
verify(('a' in ('x',1,u'y')) == 0)
verify(('a' in ('x',1,None)) == 0)
print 'done.'
330
Guido van Rossuma831cac2000-03-10 23:23:21 +0000331# Formatting:
332print 'Testing Unicode formatting strings...',
Marc-André Lemburg36619082001-01-17 19:11:13 +0000333verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
334verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
335verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
336verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
337verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
338verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
339verify(u"%c" % (u"a",) == u'a')
340verify(u"%c" % ("a",) == u'a')
341verify(u"%c" % (34,) == u'"')
342verify(u"%c" % (36,) == u'$')
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000343if sys.platform[:4] != 'java':
344 value = u"%r, %r" % (u"abc", "abc")
345 if value != u"u'abc', 'abc'":
346 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
Marc-André Lemburg84625732000-06-13 12:05:36 +0000347
Marc-André Lemburg36619082001-01-17 19:11:13 +0000348verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000349try:
Marc-André Lemburgef0a0322001-02-10 14:09:31 +0000350 if sys.platform[:4] != 'java':
351 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"}
352 else:
353 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
Marc-André Lemburg84625732000-06-13 12:05:36 +0000354except KeyError:
355 print '*** formatting failed for "%s"' % "u'abc, def'"
356else:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000357 verify(value == u'abc, def')
Marc-André Lemburg84625732000-06-13 12:05:36 +0000358
Guido van Rossum97064862000-04-10 13:52:48 +0000359# formatting jobs delegated from the string implementation:
Marc-André Lemburg36619082001-01-17 19:11:13 +0000360verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
361verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
362verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
363verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
364verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
365verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
366verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
367verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
368verify('...%s...' % u"abc" == u'...abc...')
Marc-André Lemburg542fe562001-05-02 14:21:53 +0000369verify('%*s' % (5,u'abc',) == u' abc')
370verify('%*s' % (-5,u'abc',) == u'abc ')
371verify('%*.*s' % (5,2,u'abc',) == u' ab')
372verify('%*.*s' % (5,3,u'abc',) == u' abc')
373verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
374verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000375print 'done.'
376
# Test builtin codecs
print 'Testing builtin codecs...',

# UTF-7 specific encoding tests:
# Each entry is (unicode input, expected UTF-7 encoded str).
utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'),           # RFC2152 example
            (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),   # RFC2152 example
            (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),      # RFC2152 example
            (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'),   # RFC2152 example
            (u'+', '+-'),
            (u'+-', '+--'),
            (u'+?', '+-?'),
            (u'\?', '+AFw?'),
            (u'+?', '+-?'),
            (ur'\\?', '+AFwAXA?'),
            (ur'\\\?', '+AFwAXABc?'),
            (ur'++--', '+-+---')]

for x,y in utfTests:
    verify( x.encode('utf-7') == y )

# Strict decoding of this input must raise UnicodeError ...
try:
    unicode('+3ADYAA-', 'utf-7') # surrogates not supported
except UnicodeError:
    pass
else:
    raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"

# ... while errors='replace' yields a single U+FFFD.
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')

# UTF-8 specific encoding tests:
# The expected byte strings are assembled from explicit chr() values;
# the last two inputs are surrogate pairs encoded as four bytes.
verify(u'\u20ac'.encode('utf-8') == \
       ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
verify(u'\ud800\udc02'.encode('utf-8') == \
       ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
verify(u'\ud84d\udc56'.encode('utf-8') == \
       ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
# UTF-8 specific decoding tests
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
               'utf-8') == u'\U00023456' )
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
               'utf-8') == u'\U00010002' )
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
               'utf-8') == u'\u20ac' )

# Other possible utf-8 test cases:
# * strict decoding testing for all of the
#   UTF8_ERROR cases in PyUnicode_DecodeUTF8



# Plain ASCII text decodes to the same unicode string under each of these
# codec names.
verify(unicode('hello','ascii') == u'hello')
verify(unicode('hello','utf-8') == u'hello')
verify(unicode('hello','utf8') == u'hello')
verify(unicode('hello','latin-1') == u'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000431
# Compatibility to str():
class String:
    """Object whose __str__ returns whatever is stored in attribute `x`."""
    # Class-level default; the script assigns o.x on an instance to choose
    # what __str__ returns.
    x = ''
    def __str__(self):
        return self.x
437
# unicode() and str() applied to an object whose __str__ returns first a
# str and then a unicode value.
o = String()

o.x = 'abc'
verify(unicode(o) == u'abc')
verify(str(o) == 'abc')

o.x = u'abc'
verify(unicode(o) == u'abc')
verify(str(o) == 'abc')

# For numbers, unicode(obj) must equal unicode(str(obj)).
for obj in (123, 123.45, 123L):
    verify(unicode(obj) == unicode(str(obj)))

# Error handling
# Encoding a non-ASCII character to ASCII must raise ValueError.  The
# first call is expected to raise, so the second is reached only if the
# first wrongly succeeds.
try:
    u'Andr\202 x'.encode('ascii')
    u'Andr\202 x'.encode('ascii','strict')
except ValueError:
    pass
else:
    raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
# 'ignore' drops the offending character, 'replace' substitutes '?'.
verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")

# Same checks for decoding; here 'replace' substitutes U+FFFD.
try:
    unicode('Andr\202 x','ascii')
    unicode('Andr\202 x','ascii','strict')
except ValueError:
    pass
else:
    raise TestFailed, "unicode('Andr\202') failed to raise an exception"
verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')

# Encoding plain ASCII text with each codec.
verify(u'hello'.encode('ascii') == 'hello')
verify(u'hello'.encode('utf-7') == 'hello')
verify(u'hello'.encode('utf-8') == 'hello')
verify(u'hello'.encode('utf8') == 'hello')
verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
verify(u'hello'.encode('latin-1') == 'hello')
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000479
# Roundtrip safety for BMP (just the first 1024 chars)
# encode() followed by unicode(..., encoding) must give back the input.
u = u''.join(map(unichr, range(1024)))
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

# Roundtrip safety for non-BMP (just a few chars)
# 'utf-7' is not in this list and 'raw_unicode_escape' is commented out.
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

# Latin-1 is checked over the first 256 code points only.  Unlike the
# loops above, a failure here is printed rather than propagated.
u = u''.join(map(unichr, range(256)))
for encoding in (
    'latin-1',
    ):
    try:
        verify(unicode(u.encode(encoding),encoding) == u)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

# ASCII is checked over the first 128 code points only.
u = u''.join(map(unichr, range(128)))
for encoding in (
    'ascii',
    ):
    try:
        verify(unicode(u.encode(encoding),encoding) == u)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

print 'done.'
517
# Round-trip byte strings through the standard mapping codecs: decode with
# the codec, encode again, and expect the original bytes.  Failures are
# printed, not raised.  The two halves of the byte range are checked
# separately because the second half uses a shorter codec list.
print 'Testing standard mapping codecs...',

print '0-127...',
s = ''.join(map(chr, range(128)))
for encoding in (
    'cp037', 'cp1026',
    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
    'cp863', 'cp865', 'cp866',
    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
    'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
    'mac_cyrillic', 'mac_latin2',

    'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
    'cp1256', 'cp1257', 'cp1258',
    'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

    'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
    'cp1006', 'iso8859_8',

    ### These have undefined mappings:
    #'cp424',

    ### These fail the round-trip:
    #'cp875'

    ):
    try:
        verify(unicode(s,encoding).encode(encoding) == s)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

print '128-255...',
s = ''.join(map(chr, range(128,256)))
for encoding in (
    'cp037', 'cp1026',
    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
    'cp863', 'cp865', 'cp866',
    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
    'iso8859_2', 'iso8859_4', 'iso8859_5',
    'iso8859_9', 'koi8_r', 'latin_1',
    'mac_cyrillic', 'mac_latin2',

    ### These have undefined mappings:
    #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
    #'cp1256', 'cp1257', 'cp1258',
    #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
    #'iso8859_3', 'iso8859_6', 'iso8859_7',
    #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

    ### These fail the round-trip:
    #'cp1006', 'cp875', 'iso8859_8',

    ):
    try:
        verify(unicode(s,encoding).encode(encoding) == s)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000584
# Adjacent string literals are concatenated; if any of them is a unicode
# literal the result must compare equal to the unicode string.
print 'Testing Unicode string concatenation...',
verify((u"abc" u"def") == u"abcdef")
verify(("abc" u"def") == u"abcdef")
verify((u"abc" "def") == u"abcdef")
verify((u"abc" u"def" "ghi") == u"abcdefghi")
verify(("abc" "def" u"ghi") == u"abcdefghi")
print 'done.'
Fred Drakee0243e22000-04-13 14:11:56 +0000591print 'done.'