blob: 4f6f132e25614983ab33da0bb0a4cd4dd31e9bcc [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Guido van Rossum98297ee2007-11-06 21:34:58 +00008import codecs
9import struct
10import sys
11import unittest
12import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support, string_tests
Eric Smitha1eac722011-01-29 11:15:35 +000014import _string
Guido van Rossuma831cac2000-03-10 23:23:21 +000015
# Decorator that skips a test when the interpreter's largest code point
# is U+FFFF (a "narrow" build).
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF,
    'requires wide build',
)
19
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function exposing two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns 42
    instead of a tuple; "test.unicode2" maps to a pair that returns the
    tuple (42, 42), whose first item is not a string.  Any other name
    yields None.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
37
Brett Cannon226b2302010-03-20 22:22:22 +000038class UnicodeTest(string_tests.CommonTest,
39 string_tests.MixinStrUnicodeUserStringTest,
40 string_tests.MixinStrUnicodeTest):
41
Guido van Rossumef87d6e2007-05-02 19:09:54 +000042 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000043
def checkequalnofix(self, result, object, methodname, *args):
    """Assert that object.methodname(*args) equals result, type included.

    When the call hands back the receiver itself, the call is repeated
    on an instance of a str subclass and that instance must not be the
    object returned.
    """
    realresult = getattr(object, methodname)(*args)
    self.assertEqual(realresult, result)
    self.assertIs(type(realresult), type(result))

    if realresult is not object:
        return

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    class usub(str):
        def __repr__(self):
            return 'usub(%r)' % str.__repr__(self)

    wrapped = usub(object)
    realresult = getattr(wrapped, methodname)(*args)
    self.assertEqual(realresult, result)
    self.assertIsNot(wrapped, realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000061
Jeremy Hylton504de6b2003-10-06 05:08:26 +000062 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000063 self.assertEqual('\xff', '\u00ff')
64 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000065 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
66 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
67 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000068 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000069 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000070
Georg Brandl559e5d72008-06-11 18:37:52 +000071 def test_ascii(self):
72 if not sys.platform.startswith('java'):
73 # Test basic sanity of repr()
74 self.assertEqual(ascii('abc'), "'abc'")
75 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
76 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
77 self.assertEqual(ascii('\\c'), "'\\\\c'")
78 self.assertEqual(ascii('\\'), "'\\\\'")
79 self.assertEqual(ascii('\n'), "'\\n'")
80 self.assertEqual(ascii('\r'), "'\\r'")
81 self.assertEqual(ascii('\t'), "'\\t'")
82 self.assertEqual(ascii('\b'), "'\\x08'")
83 self.assertEqual(ascii("'\""), """'\\'"'""")
84 self.assertEqual(ascii("'\""), """'\\'"'""")
85 self.assertEqual(ascii("'"), '''"'"''')
86 self.assertEqual(ascii('"'), """'"'""")
87 latin1repr = (
88 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
89 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
90 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
91 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
92 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
93 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
94 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
95 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
96 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
97 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
98 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
99 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
100 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
101 "\\xfe\\xff'")
102 testrepr = ascii(''.join(map(chr, range(256))))
103 self.assertEqual(testrepr, latin1repr)
104 # Test ascii works on wide unicode escapes without overflow.
105 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
106 ascii("\U00010000" * 39 + "\uffff" * 4096))
107
108 class WrongRepr:
109 def __repr__(self):
110 return b'byte-repr'
111 self.assertRaises(TypeError, ascii, WrongRepr())
112
Walter Dörwald28256f22003-01-19 16:59:20 +0000113 def test_repr(self):
114 if not sys.platform.startswith('java'):
115 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000116 self.assertEqual(repr('abc'), "'abc'")
117 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
118 self.assertEqual(repr('ab\\'), "'ab\\\\'")
119 self.assertEqual(repr('\\c'), "'\\\\c'")
120 self.assertEqual(repr('\\'), "'\\\\'")
121 self.assertEqual(repr('\n'), "'\\n'")
122 self.assertEqual(repr('\r'), "'\\r'")
123 self.assertEqual(repr('\t'), "'\\t'")
124 self.assertEqual(repr('\b'), "'\\x08'")
125 self.assertEqual(repr("'\""), """'\\'"'""")
126 self.assertEqual(repr("'\""), """'\\'"'""")
127 self.assertEqual(repr("'"), '''"'"''')
128 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000129 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000130 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000131 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
132 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
133 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
134 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
135 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000136 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
137 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
138 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
139 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
140 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
141 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
142 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
143 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000144 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000145 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000146 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000147 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
148 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000149
Georg Brandl559e5d72008-06-11 18:37:52 +0000150 class WrongRepr:
151 def __repr__(self):
152 return b'byte-repr'
153 self.assertRaises(TypeError, repr, WrongRepr())
154
Guido van Rossum49d6b072006-08-17 21:11:47 +0000155 def test_iterators(self):
156 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000157 it = "\u1111\u2222\u3333".__iter__()
158 self.assertEqual(next(it), "\u1111")
159 self.assertEqual(next(it), "\u2222")
160 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000161 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000162
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # Plain counts; each distinct case is asserted once.
    self.checkequalnofix(3, 'aaa', 'count', 'a')
    self.checkequalnofix(0, 'aaa', 'count', 'b')
    # Negative start / end arguments.
    self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
    self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
    self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
    self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000175
def test_find(self):
    haystack = 'abcdefghiabc'
    # (expected index, arguments passed to find)
    for expected, find_args in (
        (0, ('abc',)),
        (9, ('abc', 1)),
        (-1, ('def', 4)),
    ):
        self.checkequalnofix(expected, haystack, 'find', *find_args)

    # find() needs a str argument.
    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000183
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    # The empty string is found at the very end of the text.
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000190
def test_index(self):
    string_tests.CommonTest.test_index(self)
    # (expected index, arguments passed to index)
    for expected, index_args in (
        (0, ('',)),
        (3, ('def',)),
        (0, ('abc',)),
        (9, ('abc', 1)),
    ):
        self.checkequalnofix(expected, 'abcdefghiabc', 'index', *index_args)
    # Unlike find(), a miss raises ValueError.
    for text, index_args in (
        ('abcdefghiabc', ('hib',)),
        ('abcdefghiab', ('abc', 1)),
        ('abcdefghi', ('ghi', 8)),
        ('abcdefghi', ('ghi', -1)),
    ):
        self.assertRaises(ValueError, text.index, *index_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000201
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    # (expected index, arguments passed to rindex)
    for expected, rindex_args in (
        (12, ('',)),
        (3, ('def',)),
        (9, ('abc',)),
        (0, ('abc', 0, -1)),
    ):
        self.checkequalnofix(expected, 'abcdefghiabc', 'rindex', *rindex_args)

    # A miss raises ValueError.
    for text, rindex_args in (
        ('abcdefghiabc', ('hib',)),
        ('defghiabc', ('def', 1)),
        ('defghiabc', ('abc', 0, -1)),
        ('abcdefghi', ('ghi', 0, 8)),
        ('abcdefghi', ('ghi', 0, -1)),
    ):
        self.assertRaises(ValueError, text.rindex, *rindex_args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000214
def test_maketrans_translate(self):
    source = 'abababc'

    # these work with plain translate()
    for expected, table in (
        ('bbbc', {ord('a'): None}),
        ('iiic', {ord('a'): None, ord('b'): ord('i')}),
        ('iiix', {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}),
        ('c', {ord('a'): None, ord('b'): ''}),
    ):
        self.checkequalnofix(expected, source, 'translate', table)
    self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'): 'yy'})

    # this needs maketrans()
    # (a table keyed by characters instead of ordinals leaves the text as is)
    self.checkequalnofix(source, source, 'translate', {'b': '<i>'})
    maketrans = self.type2test.maketrans
    tbl = maketrans({'a': None, 'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', source, 'translate', tbl)
    # test alternative way of calling maketrans()
    tbl = maketrans('abc', 'xyz', 'd')
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

    # Invalid maketrans() calls: (expected exception, arguments).
    for exc_type, bad_args in (
        (TypeError, ()),
        (ValueError, ('abc', 'defg')),
        (TypeError, (2, 'def')),
        (TypeError, ('abc', 2)),
        (TypeError, ('abc', 'def', 2)),
        (ValueError, ({'xy': 2},)),
        (TypeError, ({(1,): 2},)),
    ):
        self.assertRaises(exc_type, self.type2test.maketrans, *bad_args)

    # translate() takes exactly one table argument.
    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000246
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Multi-character separators.
    self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
    # A separator at the very end yields a trailing empty string.
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000254
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # Not a str, but convertible through __str__.
    class MyWrapper:
        def __init__(self, sval): self.sval = sval
        def __str__(self): return self.sval

    # join() over a list, a tuple and the helper Sequence type.
    self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
    self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
    self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
    # Items that are not str are rejected, even when they define __str__.
    self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
    self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000274
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # Only the first occurrence is replaced when count is 1.
    original = 'one!two!three!'
    self.checkequalnofix('one@two!three!', original, 'replace', '!', '@', 1)
    # The replacement must be a str.
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000281
def test_bytes_comparison(self):
    with support.check_warnings():
        # Comparing str with bytes may emit BytesWarning; silence it here.
        warnings.simplefilter('ignore', BytesWarning)
        for other in (b'abc', bytearray(b'abc')):
            self.assertEqual('abc' == other, False)
            self.assertEqual('abc' != other, True)
Brett Cannon40430012007-10-22 20:24:51 +0000289
Walter Dörwald28256f22003-01-19 16:59:20 +0000290 def test_comparison(self):
291 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000292 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000293 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000294 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000295
296 if 0:
297 # Move these tests to a Unicode collation module test...
298 # Testing UTF-16 code point order comparisons...
299
300 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000301 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000302 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000303 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000304
305 # Non surrogate above surrogate value, fixup required
306 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000307 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000308
309 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000310 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000311 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000312 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000313 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000314 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000315 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000316 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000317 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000318 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000319 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000320 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000321 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000322 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000323 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000324 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000325 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000326 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000327 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000328 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000329 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000330 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000331 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000332 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000333 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000334 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000335 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000336 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000337 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000338 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000339 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000340 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000341 test_lecmp(s, s2)
342
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000343 test_fixup('\ue000')
344 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000345
346 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000347 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000348
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # non-BMP, uppercase
    for upper_ch in ('\U00010401', '\U00010427'):
        self.assertFalse(upper_ch.islower())
    # non-BMP, lowercase
    for lower_ch in ('\U00010429', '\U0001044E'):
        self.assertTrue(lower_ch.islower())
    # non-BMP, non-cased
    for uncased_ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased_ch.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000361
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    on_java = sys.platform.startswith('java')
    if not on_java:
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # non-BMP, uppercase
    for upper_ch in ('\U00010401', '\U00010427'):
        self.assertTrue(upper_ch.isupper())
    # non-BMP, lowercase
    for lower_ch in ('\U00010429', '\U0001044E'):
        self.assertFalse(lower_ch.isupper())
    # non-BMP, non-cased
    for uncased_ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased_ch.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000375
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for titled in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, titled, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for char in not_title:
        self.assertFalse(char.istitle(), '{!a} is not title'.format(char))
387
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # (expected result, character)
    for expected, char in (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    ):
        self.checkequalnofix(expected, char, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for char in non_bmp:
        self.assertFalse(char.isspace(), '{!a} is not space.'.format(char))
397
def test_isalnum(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    # Non-BMP characters that must all count as alphanumeric.
    alnum_chars = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for char in alnum_chars:
        self.assertTrue(char.isalnum(), '{!a} is alnum.'.format(char))
Walter Dörwald28256f22003-01-19 16:59:20 +0000403
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    for cased_ch in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(cased_ch.isalpha())
    # non-BMP, non-cased
    for uncased_ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased_ch.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000415
def test_isdecimal(self):
    # (expected result, text)
    for expected, text in (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    ):
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    not_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    for char in not_decimal:
        self.assertFalse(char.isdecimal(), '{!a} is not decimal.'.format(char))
    for char in ('\U0001D7F6', '\U00011066', '\U000104A0'):
        self.assertTrue(char.isdecimal(), '{!a} is decimal.'.format(char))
433
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # (expected result, character)
    for expected, char in (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    ):
        self.checkequalnofix(expected, char, 'isdigit')

    not_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    for char in not_digits:
        self.assertFalse(char.isdigit(), '{!a} is not a digit.'.format(char))
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for char in digits:
        self.assertTrue(char.isdigit(), '{!a} is a digit.'.format(char))
445
def test_isnumeric(self):
    # (expected result, text)
    for expected, text in (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    ):
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    not_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    for char in not_numeric:
        self.assertFalse(char.isnumeric(), '{!a} is not numeric.'.format(char))
    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for char in numeric:
        self.assertTrue(char.isnumeric(), '{!a} is numeric.'.format(char))
464
Martin v. Löwis47383402007-08-15 07:32:56 +0000465 def test_isidentifier(self):
466 self.assertTrue("a".isidentifier())
467 self.assertTrue("Z".isidentifier())
468 self.assertTrue("_".isidentifier())
469 self.assertTrue("b0".isidentifier())
470 self.assertTrue("bc".isidentifier())
471 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000472 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500473 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000474
475 self.assertFalse(" ".isidentifier())
476 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000477 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000478 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000479
Georg Brandl559e5d72008-06-11 18:37:52 +0000480 def test_isprintable(self):
481 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000482 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000483 self.assertTrue("abcdefg".isprintable())
484 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000485 # some defined Unicode character
486 self.assertTrue("\u0374".isprintable())
487 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000488 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000489 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000490 self.assertFalse("\ud800".isprintable())
491
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300492 self.assertTrue('\U0001F46F'.isprintable())
493 self.assertFalse('\U000E0020'.isprintable())
494
495 def test_surrogates(self):
496 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
497 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
498 self.assertTrue(s.islower())
499 self.assertFalse(s.isupper())
500 self.assertFalse(s.istitle())
501 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
502 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
503 self.assertFalse(s.islower())
504 self.assertTrue(s.isupper())
505 self.assertTrue(s.istitle())
506
507 for meth_name in ('islower', 'isupper', 'istitle'):
508 meth = getattr(str, meth_name)
509 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
510 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
511
512 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
513 'isdecimal', 'isnumeric',
514 'isidentifier', 'isprintable'):
515 meth = getattr(str, meth_name)
516 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
517 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
518 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
519 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
520
521
@requires_wide_build
def test_lower(self):
    string_tests.CommonTest.test_lower(self)
    # (input, expected lower-cased form) using non-BMP cased letters
    for text, lowered in (
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
    ):
        self.assertEqual(text.lower(), lowered)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300532
@requires_wide_build
def test_upper(self):
    string_tests.CommonTest.test_upper(self)
    # (input, expected upper-cased form) using non-BMP cased letters
    for text, uppered in (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
    ):
        self.assertEqual(text.upper(), uppered)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300543
@requires_wide_build
def test_capitalize(self):
    string_tests.CommonTest.test_capitalize(self)
    # (input, expected capitalized form) using non-BMP cased letters
    for text, capitalized in (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
    ):
        self.assertEqual(text.capitalize(), capitalized)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300556
@requires_wide_build
def test_title(self):
    # Run the shared str.title() checks first, then verify title-casing
    # of words built from a supplementary-plane cased pair: the first
    # character of each word is uppercased, the rest are lowercased.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    self.assertEqual('\U0001044F'.title(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300571
@requires_wide_build
def test_swapcase(self):
    # Run the shared str.swapcase() checks first, then verify that each
    # member of a supplementary-plane cased pair is swapped to the other,
    # alone and in combination with ASCII letters.
    string_tests.CommonTest.test_swapcase(self)
    self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
    self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
    self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.swapcase(),
                     '\U0001044F\U00010427')
    self.assertEqual('\U0001044F\U00010427'.swapcase(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                     'x\U0001044FX\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300585
Walter Dörwald28256f22003-01-19 16:59:20 +0000586 def test_contains(self):
587 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000588 self.assertIn('a', 'abdb')
589 self.assertIn('a', 'bdab')
590 self.assertIn('a', 'bdaba')
591 self.assertIn('a', 'bdba')
592 self.assertNotIn('a', 'bdb')
593 self.assertIn('a', 'bdba')
594 self.assertIn('a', ('a',1,None))
595 self.assertIn('a', (1,None,'a'))
596 self.assertIn('a', ('a',1,None))
597 self.assertIn('a', (1,None,'a'))
598 self.assertNotIn('a', ('x',1,'y'))
599 self.assertNotIn('a', ('x',1,None))
600 self.assertNotIn('abcd', 'abcxxxx')
601 self.assertIn('ab', 'abcd')
602 self.assertIn('ab', 'abc')
603 self.assertIn('ab', (1,None,'ab'))
604 self.assertIn('', 'abc')
605 self.assertIn('', '')
606 self.assertIn('', 'abc')
607 self.assertNotIn('\0', 'abc')
608 self.assertIn('\0', '\0abc')
609 self.assertIn('\0', 'abc\0')
610 self.assertIn('a', '\0abc')
611 self.assertIn('asdf', 'asdf')
612 self.assertNotIn('asdf', 'asd')
613 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000614
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000615 self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000616
def test_format(self):
    # Exercise str.format(): brace escaping, positional/keyword fields,
    # attribute and index lookups, string format specs, !r/!s/!a
    # conversions, user-defined __format__, nested (computed) specs and
    # the errors raised for malformed format strings.
    self.assertEqual(''.format(), '')
    self.assertEqual('a'.format(), 'a')
    self.assertEqual('ab'.format(), 'ab')
    self.assertEqual('a{{'.format(), 'a{')
    self.assertEqual('a}}'.format(), 'a}')
    self.assertEqual('{{b'.format(), '{b')
    self.assertEqual('}}b'.format(), '}b')
    self.assertEqual('a{{b'.format(), 'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
    self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                     "My name is Fred")
    self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                     "My name is Fred :-{}")

    d = datetime.date(2007, 8, 18)
    self.assertEqual("The year is {0.year}".format(d),
                     "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return 'E(' + self.x + ')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return 'F(' + self.x + ')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return "string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return 'G(' + self.x + ')'
            return object.__format__(self, format_spec)

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(''.format(), '')
    self.assertEqual('abc'.format(), 'abc')
    self.assertEqual('{0}'.format('abc'), 'abc')
    self.assertEqual('{0:}'.format('abc'), 'abc')
#    self.assertEqual('{ 0 }'.format('abc'), 'abc')
    self.assertEqual('X{0}'.format('abc'), 'Xabc')
    self.assertEqual('{0}X'.format('abc'), 'abcX')
    self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
    self.assertEqual('{1}'.format(1, 'abc'), 'abc')
    self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
    self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
    self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
    self.assertEqual('{0}'.format(-15), '-15')
    self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
    self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
    self.assertEqual('{{'.format(), '{')
    self.assertEqual('}}'.format(), '}')
    self.assertEqual('{{}}'.format(), '{}')
    self.assertEqual('{{x}}'.format(), '{x}')
    self.assertEqual('{{{0}}}'.format(123), '{123}')
    self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
    self.assertEqual('}}{{'.format(), '}{')
    self.assertEqual('}}x{{'.format(), '}x{')

    # weird field names
    self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
    self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
    self.assertEqual("{0[ ]}".format({' ':3}), '3')

    self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
    self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
    self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
    self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
    self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
    self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
    self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

    # strings
    # NOTE: the expected values below contain significant runs of
    # padding spaces; their length is dictated by the width in the spec.
    self.assertEqual('{0:.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:.3s}'.format('ab'), 'ab')
    self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
    self.assertEqual('{0:.0s}'.format('abcdef'), '')
    self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
    self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
    self.assertEqual('{0:x<0s}'.format('result'), 'result')
    self.assertEqual('{0:x<5s}'.format('result'), 'result')
    self.assertEqual('{0:x<6s}'.format('result'), 'result')
    self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
    self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
    self.assertEqual('{0: <7s}'.format('result'), 'result ')
    self.assertEqual('{0:<7s}'.format('result'), 'result ')
    self.assertEqual('{0:>7s}'.format('result'), ' result')
    self.assertEqual('{0:>8s}'.format('result'), '  result')
    self.assertEqual('{0:^8s}'.format('result'), ' result ')
    self.assertEqual('{0:^9s}'.format('result'), ' result  ')
    self.assertEqual('{0:^10s}'.format('result'), '  result  ')
    self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
    self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
    self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual('{0:abc}'.format(C()), 'abc')

    # !r, !s and !a coercions
    self.assertEqual('{0!s}'.format('Hello'), 'Hello')
    self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
    self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
    self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
    self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
    self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
    self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
    self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
    self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
    self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
    self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
    self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

    # test fallback to object.__format__
    self.assertEqual('{0}'.format({}), '{}')
    self.assertEqual('{0}'.format([]), '[]')
    self.assertEqual('{0}'.format([1]), '[1]')

    self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
    self.assertEqual('{0!s}'.format(G('data')), 'string is data')

    # E has no __format__ and G forwards to object.__format__, so a
    # non-empty spec goes through object.__format__ and must warn.
    msg = 'object.__format__ with a non-empty format string is deprecated'
    with support.check_warnings((msg, DeprecationWarning)):
        self.assertEqual('{0:^10}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:^10s}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:>15s}'.format(G('data')), ' string is data')

    self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                   month=8,
                                                   day=27)),
                     "date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual("{0}".format(J(10)), "20")


    # string format specifiers
    self.assertEqual('{0:}'.format('a'), 'a')

    # computed format specifiers
    self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
    self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
    self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
    self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
    self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

    # test various errors
    self.assertRaises(ValueError, '{'.format)
    self.assertRaises(ValueError, '}'.format)
    self.assertRaises(ValueError, 'a{'.format)
    self.assertRaises(ValueError, 'a}'.format)
    self.assertRaises(ValueError, '{a'.format)
    self.assertRaises(ValueError, '}a'.format)
    self.assertRaises(IndexError, '{0}'.format)
    self.assertRaises(IndexError, '{1}'.format, 'abc')
    self.assertRaises(KeyError, '{x}'.format)
    self.assertRaises(ValueError, "}{".format)
    self.assertRaises(ValueError, "abc{0:{}".format)
    self.assertRaises(ValueError, "{0".format)
    self.assertRaises(IndexError, "{0.}".format)
    self.assertRaises(ValueError, "{0.}".format, 0)
    self.assertRaises(IndexError, "{0[}".format)
    self.assertRaises(ValueError, "{0[}".format, [])
    self.assertRaises(KeyError, "{0]}".format)
    self.assertRaises(ValueError, "{0.[]}".format, 0)
    self.assertRaises(ValueError, "{0..foo}".format, 0)
    self.assertRaises(ValueError, "{0[0}".format, 0)
    self.assertRaises(ValueError, "{0[0:foo}".format, 0)
    self.assertRaises(KeyError, "{c]}".format)
    self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, "{0}}".format, 0)
    self.assertRaises(KeyError, "{foo}".format, bar=3)
    self.assertRaises(ValueError, "{0!x}".format, 3)
    self.assertRaises(ValueError, "{0!}".format, 0)
    self.assertRaises(ValueError, "{0!rs}".format, 0)
    self.assertRaises(ValueError, "{!}".format)
    self.assertRaises(IndexError, "{:}".format)
    self.assertRaises(IndexError, "{:s}".format)
    self.assertRaises(IndexError, "{}".format)
    # an index too large to be converted must be rejected cleanly
    big = "23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, ("{" + big + "}").format)
    self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, "{0[0]x}".format, [None])
    self.assertRaises(ValueError, "{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
    self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, "{0:-s}".format, '')
    self.assertRaises(ValueError, format, "", "-")
    self.assertRaises(ValueError, "{0:=s}".format, '')

    # Alternate formatting is not supported
    self.assertRaises(ValueError, format, '', '#')
    self.assertRaises(ValueError, format, '', '#20')
859
Eric Smith27bbca62010-11-04 17:06:58 +0000860 def test_format_map(self):
861 self.assertEqual(''.format_map({}), '')
862 self.assertEqual('a'.format_map({}), 'a')
863 self.assertEqual('ab'.format_map({}), 'ab')
864 self.assertEqual('a{{'.format_map({}), 'a{')
865 self.assertEqual('a}}'.format_map({}), 'a}')
866 self.assertEqual('{{b'.format_map({}), '{b')
867 self.assertEqual('}}b'.format_map({}), '}b')
868 self.assertEqual('a{{b'.format_map({}), 'a{b')
869
870 # using mappings
871 class Mapping(dict):
872 def __missing__(self, key):
873 return key
874 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
875 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
876
877 class InternalMapping:
878 def __init__(self):
879 self.mapping = {'a': 'hello'}
880 def __getitem__(self, key):
881 return self.mapping[key]
882 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
883
884
Eric Smith27bbca62010-11-04 17:06:58 +0000885 class C:
886 def __init__(self, x=100):
887 self._x = x
888 def __format__(self, spec):
889 return spec
Eric Smith27bbca62010-11-04 17:06:58 +0000890 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
891
892 # test various errors
893 self.assertRaises(TypeError, '{'.format_map)
894 self.assertRaises(TypeError, '}'.format_map)
895 self.assertRaises(TypeError, 'a{'.format_map)
896 self.assertRaises(TypeError, 'a}'.format_map)
897 self.assertRaises(TypeError, '{a'.format_map)
898 self.assertRaises(TypeError, '}a'.format_map)
899
Eric V. Smith12ebefc2011-07-18 14:03:41 -0400900 # issue #12579: can't supply positional params to format_map
901 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
902 self.assertRaises(ValueError, '{}'.format_map, 'a')
903 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
904
Eric Smith8ec90442009-03-14 12:29:34 +0000905 def test_format_auto_numbering(self):
906 class C:
907 def __init__(self, x=100):
908 self._x = x
909 def __format__(self, spec):
910 return spec
911
912 self.assertEqual('{}'.format(10), '10')
913 self.assertEqual('{:5}'.format('s'), 's ')
914 self.assertEqual('{!r}'.format('s'), "'s'")
915 self.assertEqual('{._x}'.format(C(10)), '10')
916 self.assertEqual('{[1]}'.format([1, 2]), '2')
917 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
918 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
919
920 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
921 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
922
923 # can't mix and match numbering and auto-numbering
924 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
925 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
926 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
927 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
928
929 # can mix and match auto-numbering and named
930 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
931 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
932 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
933 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
934
def test_formatting(self):
    # Run the shared %-formatting checks, then exercise str-specific
    # cases: %s/%i/%f/%r/%a/%c conversions, mapping keys, '*' width and
    # precision, objects with __str__, and nan/inf rendering.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    # (%5.2f pads to five columns, hence the double space before 3.xx)
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes the width (negative means left-justify) and precision
    # from the argument tuple; the results are padded to five columns.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')
988
Ezio Melottiba42fd52011-04-26 06:09:45 +0300989 def test_startswith_endswith_errors(self):
990 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +0300991 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +0300992 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +0300993 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +0300994 self.assertIn('str', exc)
995 self.assertIn('tuple', exc)
996
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001001
Walter Dörwald28256f22003-01-19 16:59:20 +00001002 def test_constructor(self):
1003 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1004
1005 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001006 str('unicode remains unicode'),
1007 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001008 )
1009
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001010 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001011 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001012
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001013 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1014 subclass = UnicodeSubclass(text)
1015 self.assertEqual(str(subclass), text)
1016 self.assertEqual(len(subclass), len(text))
1017 if text == 'ascii':
1018 self.assertEqual(subclass.encode('ascii'), b'ascii')
1019 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001020
Walter Dörwald28256f22003-01-19 16:59:20 +00001021 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001022 str('strings are converted to unicode'),
1023 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001024 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001025
Walter Dörwald28256f22003-01-19 16:59:20 +00001026 class StringCompat:
1027 def __init__(self, x):
1028 self.x = x
1029 def __str__(self):
1030 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001031
Walter Dörwald28256f22003-01-19 16:59:20 +00001032 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001033 str(StringCompat('__str__ compatible objects are recognized')),
1034 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001035 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001036
Walter Dörwald28256f22003-01-19 16:59:20 +00001037 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001038
Walter Dörwald28256f22003-01-19 16:59:20 +00001039 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001040 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001041 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001042
Guido van Rossume2a383d2007-01-15 16:59:06 +00001043 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001044 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001045
Walter Dörwald28256f22003-01-19 16:59:20 +00001046 # unicode(obj, encoding, error) tests (this maps to
1047 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001048
Walter Dörwald28256f22003-01-19 16:59:20 +00001049 if not sys.platform.startswith('java'):
1050 self.assertRaises(
1051 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001052 str,
1053 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001054 'utf-8',
1055 'strict'
1056 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001057
Walter Dörwald28256f22003-01-19 16:59:20 +00001058 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001059 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001060 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001061 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001062
Walter Dörwald28256f22003-01-19 16:59:20 +00001063 if not sys.platform.startswith('java'):
1064 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001065 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001066 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001067 'utf-8',
1068 'strict'
1069 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001070 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001071 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001072
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001073 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001074
Walter Dörwald28256f22003-01-19 16:59:20 +00001075 def test_codecs_utf7(self):
1076 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001077 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1078 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1079 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1080 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1081 ('+', b'+-'),
1082 ('+-', b'+--'),
1083 ('+?', b'+-?'),
1084 ('\?', b'+AFw?'),
1085 ('+?', b'+-?'),
1086 (r'\\?', b'+AFwAXA?'),
1087 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001088 (r'++--', b'+-+---'),
1089 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1090 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001091 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001092
Walter Dörwald28256f22003-01-19 16:59:20 +00001093 for (x, y) in utfTests:
1094 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001095
Antoine Pitrou244651a2009-05-04 18:56:13 +00001096 # Unpaired surrogates not supported
Walter Dörwald67e83882007-05-05 12:26:27 +00001097 self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001098
Antoine Pitrou244651a2009-05-04 18:56:13 +00001099 self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001100
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001101 # Issue #2242: crash on some Windows/MSVC versions
Antoine Pitrou244651a2009-05-04 18:56:13 +00001102 self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1103
1104 # Direct encoded characters
1105 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1106 # Optional direct characters
1107 set_o = '!"#$%&*;<=>@[]^_`{|}'
1108 for c in set_d:
1109 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1110 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1111 for c in set_o:
1112 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001113
Walter Dörwald28256f22003-01-19 16:59:20 +00001114 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001115 self.assertEqual(''.encode('utf-8'), b'')
1116 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Martin v. Löwis74b7e442009-06-01 04:23:07 +00001117 if sys.maxunicode == 65535:
1118 self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
1119 self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001120 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1121 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Martin v. Löwis74b7e442009-06-01 04:23:07 +00001122 if sys.maxunicode == 65535:
1123 self.assertEqual(
1124 ('\ud800\udc02'*1000).encode('utf-8'),
1125 b'\xf0\x90\x80\x82'*1000)
Walter Dörwald28256f22003-01-19 16:59:20 +00001126 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001127 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1128 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1129 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1130 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1131 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1132 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001133 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1134 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1135 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1136 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1137 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1138 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1139 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1140 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1141 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1142 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001143 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001144
Walter Dörwald28256f22003-01-19 16:59:20 +00001145 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001146 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1147 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1148 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001149
Walter Dörwald28256f22003-01-19 16:59:20 +00001150 # Other possible utf-8 test cases:
1151 # * strict decoding testing for all of the
1152 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001153
Ezio Melotti57221d02010-07-01 07:32:02 +00001154 def test_utf8_decode_valid_sequences(self):
1155 sequences = [
1156 # single byte
1157 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1158 # 2 bytes
1159 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1160 # 3 bytes
1161 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1162 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1163 # 4 bytes
1164 (b'\xF0\x90\x80\x80', '\U00010000'),
1165 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1166 ]
1167 for seq, res in sequences:
1168 self.assertEqual(seq.decode('utf-8'), res)
1169
1170
1171 def test_utf8_decode_invalid_sequences(self):
1172 # continuation bytes in a sequence of 2, 3, or 4 bytes
1173 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1174 # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
1175 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1176 # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
1177 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1178 invalid_start_bytes = (
1179 continuation_bytes + invalid_2B_seq_start_bytes +
1180 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1181 )
1182
1183 for byte in invalid_start_bytes:
1184 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1185
1186 for sb in invalid_2B_seq_start_bytes:
1187 for cb in continuation_bytes:
1188 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1189
1190 for sb in invalid_4B_seq_start_bytes:
1191 for cb1 in continuation_bytes[:3]:
1192 for cb3 in continuation_bytes[:3]:
1193 self.assertRaises(UnicodeDecodeError,
1194 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1195
1196 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1197 self.assertRaises(UnicodeDecodeError,
1198 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1199 self.assertRaises(UnicodeDecodeError,
1200 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1201 # surrogates
1202 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1203 self.assertRaises(UnicodeDecodeError,
1204 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1205 self.assertRaises(UnicodeDecodeError,
1206 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1207 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1208 self.assertRaises(UnicodeDecodeError,
1209 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1210 self.assertRaises(UnicodeDecodeError,
1211 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1212 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1213 self.assertRaises(UnicodeDecodeError,
1214 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1215 self.assertRaises(UnicodeDecodeError,
1216 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1217
1218 def test_issue8271(self):
1219 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1220 # only the start byte and the continuation byte(s) are now considered
1221 # invalid, instead of the number of bytes specified by the start byte.
1222 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1223 # table 3-8, Row 2) for more information about the algorithm used.
1224 FFFD = '\ufffd'
1225 sequences = [
1226 # invalid start bytes
1227 (b'\x80', FFFD), # continuation byte
1228 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1229 (b'\xc0', FFFD),
1230 (b'\xc0\xc0', FFFD*2),
1231 (b'\xc1', FFFD),
1232 (b'\xc1\xc0', FFFD*2),
1233 (b'\xc0\xc1', FFFD*2),
1234 # with start byte of a 2-byte sequence
1235 (b'\xc2', FFFD), # only the start byte
1236 (b'\xc2\xc2', FFFD*2), # 2 start bytes
1237 (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes
1238 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1239 # with start byte of a 3-byte sequence
1240 (b'\xe1', FFFD), # only the start byte
1241 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1242 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1243 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1244 (b'\xe1\x80', FFFD), # only 1 continuation byte
1245 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1246 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1247 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1248 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1249 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1250 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1251 # with start byte of a 4-byte sequence
1252 (b'\xf1', FFFD), # only the start byte
1253 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1254 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1255 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1256 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1257 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1258 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1259 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1260 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1261 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1262 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1263 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1264 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1265 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1266 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1267 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1268 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1269 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1270 # with invalid start byte of a 4-byte sequence (rfc2279)
1271 (b'\xf5', FFFD), # only the start byte
1272 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1273 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1274 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1275 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1276 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1277 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1278 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1279 # with invalid start byte of a 5-byte sequence (rfc2279)
1280 (b'\xf8', FFFD), # only the start byte
1281 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1282 (b'\xf8\x80', FFFD*2), # only one continuation byte
1283 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1284 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1285 # with invalid start byte of a 6-byte sequence (rfc2279)
1286 (b'\xfc', FFFD), # only the start byte
1287 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1288 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1289 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1290 # invalid start byte
1291 (b'\xfe', FFFD),
1292 (b'\xfe\x80\x80', FFFD*3),
1293 # other sequences
1294 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1295 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1296 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1297 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1298 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1299 ]
1300 for n, (seq, res) in enumerate(sequences):
1301 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1302 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1303 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1304 self.assertEqual(seq.decode('utf-8', 'ignore'),
1305 res.replace('\uFFFD', ''))
1306
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001307 def test_codecs_idna(self):
1308 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001309 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001310
def test_codecs_errors(self):
    """Check the error paths of encoding, decoding and codec lookup."""
    check = self.assertEqual
    fails = self.assertRaises
    text = 'Andr\202 x'
    raw = b'Andr\202 x'

    # Error handling (encoding)
    fails(UnicodeError, text.encode, 'ascii')
    fails(UnicodeError, text.encode, 'ascii','strict')
    check(text.encode('ascii','ignore'), b"Andr x")
    check(text.encode('ascii','replace'), b"Andr? x")
    # keyword arguments must give the same result as positional ones
    check(text.encode('ascii', 'replace'),
          text.encode('ascii', errors='replace'))
    check(text.encode('ascii', 'ignore'),
          text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    fails(UnicodeError, str, raw, 'ascii')
    fails(UnicodeError, str, raw, 'ascii', 'strict')
    check(str(raw, 'ascii', 'ignore'), "Andr x")
    check(str(raw, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    check(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    fails(UnicodeError, b"\\".decode, "unicode-escape")

    # Error handling (bad codec return values): the test.unicode1 and
    # test.unicode2 codecs come from the search_function registered at the
    # top of this file; they return a non-tuple and a non-str tuple.
    fails(TypeError, b"hello".decode, "test.unicode1")
    fails(TypeError, str, b"hello", "test.unicode2")
    fails(TypeError, "hello".encode, "test.unicode1")
    fails(TypeError, "hello".encode, "test.unicode2")

    # executes PyUnicode_Encode()
    import imp
    fails(ImportError, imp.find_module,
          "non-existing module", ["non-existing dir"])

    # Error handling (wrong arguments)
    fails(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (int, float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            fails(UnicodeError, convert, lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00001357
Walter Dörwald28256f22003-01-19 16:59:20 +00001358 def test_codecs(self):
1359 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00001360 self.assertEqual('hello'.encode('ascii'), b'hello')
1361 self.assertEqual('hello'.encode('utf-7'), b'hello')
1362 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001363 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00001364 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
1365 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
1366 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00001367
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001368 # Default encoding is utf-8
1369 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
1370
Walter Dörwald28256f22003-01-19 16:59:20 +00001371 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001372 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001373 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001374 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1375 'utf-16-be', 'raw_unicode_escape',
1376 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001377 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001378
Walter Dörwald28256f22003-01-19 16:59:20 +00001379 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001380 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001381 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001382 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001383 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001384
Walter Dörwald28256f22003-01-19 16:59:20 +00001385 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001386 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001387 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001388 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001389 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001390
Walter Dörwald28256f22003-01-19 16:59:20 +00001391 # Roundtrip safety for non-BMP (just a few chars)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001392 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
Walter Dörwald28256f22003-01-19 16:59:20 +00001393 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1394 #'raw_unicode_escape',
1395 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001396 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001397
Walter Dörwald28256f22003-01-19 16:59:20 +00001398 # UTF-8 must be roundtrip safe for all UCS-2 code points
1399 # This excludes surrogates: in the full range, there would be
1400 # a surrogate pair (\udbff\udc00), which gets converted back
1401 # to a non-BMP character (\U0010fc00)
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001402 u = ''.join(map(chr, list(range(0,0xd800)) +
1403 list(range(0xe000,0x10000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00001404 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001405 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001406
Walter Dörwald28256f22003-01-19 16:59:20 +00001407 def test_codecs_charmap(self):
1408 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00001409 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00001410 for encoding in (
1411 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001412 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1413 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001414 'cp863', 'cp865', 'cp866',
1415 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1416 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1417 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1418 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Walter Dörwald28256f22003-01-19 16:59:20 +00001420 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1421 'cp1256', 'cp1257', 'cp1258',
1422 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001423
Walter Dörwald28256f22003-01-19 16:59:20 +00001424 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1425 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001426
Walter Dörwald28256f22003-01-19 16:59:20 +00001427 ### These have undefined mappings:
1428 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001429
Walter Dörwald28256f22003-01-19 16:59:20 +00001430 ### These fail the round-trip:
1431 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00001432
Walter Dörwald28256f22003-01-19 16:59:20 +00001433 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001434 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001435
Walter Dörwald28256f22003-01-19 16:59:20 +00001436 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00001437 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00001438 for encoding in (
1439 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001440 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1441 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001442 'cp863', 'cp865', 'cp866',
1443 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1444 'iso8859_2', 'iso8859_4', 'iso8859_5',
1445 'iso8859_9', 'koi8_r', 'latin_1',
1446 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00001447
Walter Dörwald28256f22003-01-19 16:59:20 +00001448 ### These have undefined mappings:
1449 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1450 #'cp1256', 'cp1257', 'cp1258',
1451 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1452 #'iso8859_3', 'iso8859_6', 'iso8859_7',
1453 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00001454
Walter Dörwald28256f22003-01-19 16:59:20 +00001455 ### These fail the round-trip:
1456 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00001457
Walter Dörwald28256f22003-01-19 16:59:20 +00001458 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001459 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001460
Walter Dörwald28256f22003-01-19 16:59:20 +00001461 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001462 self.assertEqual(("abc" "def"), "abcdef")
1463 self.assertEqual(("abc" "def"), "abcdef")
1464 self.assertEqual(("abc" "def"), "abcdef")
1465 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1466 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001467
Walter Dörwald28256f22003-01-19 16:59:20 +00001468 def test_printing(self):
1469 class BitBucket:
1470 def write(self, text):
1471 pass
Fred Drake004d5e62000-10-23 17:22:08 +00001472
Walter Dörwald28256f22003-01-19 16:59:20 +00001473 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001474 print('abc', file=out)
1475 print('abc', 'def', file=out)
1476 print('abc', 'def', file=out)
1477 print('abc', 'def', file=out)
1478 print('abc\n', file=out)
1479 print('abc\n', end=' ', file=out)
1480 print('abc\n', end=' ', file=out)
1481 print('def\n', file=out)
1482 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00001483
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001484 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001485 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001486 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1487 self.assertEqual(x, y)
1488
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001489 y = br'\U00100000'
1490 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1491 self.assertEqual(x, y)
1492 y = br'\U00010000'
1493 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1494 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00001495
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001496 try:
1497 br'\U11111111'.decode("raw-unicode-escape")
1498 except UnicodeDecodeError as e:
1499 self.assertEqual(e.start, 0)
1500 self.assertEqual(e.end, 10)
1501 else:
1502 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00001503
Brett Cannonc3647ac2005-04-26 03:45:26 +00001504 def test_conversion(self):
1505 # Make sure __unicode__() works properly
1506 class Foo0:
1507 def __str__(self):
1508 return "foo"
1509
1510 class Foo1:
Guido van Rossum98297ee2007-11-06 21:34:58 +00001511 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001512 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001513
1514 class Foo2(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001515 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001516 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001517
1518 class Foo3(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001519 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001520 return "foo"
1521
1522 class Foo4(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001523 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001524 return "foo"
1525
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001526 class Foo5(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001527 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001528 return "foo"
1529
1530 class Foo6(str):
1531 def __str__(self):
1532 return "foos"
1533
Guido van Rossum98297ee2007-11-06 21:34:58 +00001534 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001535 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001536
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001537 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001538 def __str__(self):
1539 return "foos"
Guido van Rossum98297ee2007-11-06 21:34:58 +00001540 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001541 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001542
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001543 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001544 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001545 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001546 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001547 return self
1548
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001549 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001550 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001551 return "not unicode"
1552
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001553 self.assertEqual(str(Foo0()), "foo")
1554 self.assertEqual(str(Foo1()), "foo")
1555 self.assertEqual(str(Foo2()), "foo")
1556 self.assertEqual(str(Foo3()), "foo")
1557 self.assertEqual(str(Foo4("bar")), "foo")
1558 self.assertEqual(str(Foo5("bar")), "foo")
1559 self.assertEqual(str(Foo6("bar")), "foou")
1560 self.assertEqual(str(Foo7("bar")), "foou")
1561 self.assertEqual(str(Foo8("foo")), "foofoo")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001562 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +00001563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001564 def test_unicode_repr(self):
1565 class s1:
1566 def __repr__(self):
1567 return '\\n'
1568
1569 class s2:
1570 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001571 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001572
1573 self.assertEqual(repr(s1()), '\\n')
1574 self.assertEqual(repr(s2()), '\\n')
1575
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001576 def test_printable_repr(self):
1577 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001578 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001579
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001580 def test_expandtabs_overflows_gracefully(self):
1581 # This test only affects 32-bit platforms because expandtabs can only take
1582 # an int as the max value, not a 64-bit C long. If expandtabs is changed
1583 # to take a 64-bit long, this test should apply to all platforms.
Christian Heimesa37d4c62007-12-04 23:02:19 +00001584 if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001585 return
Christian Heimesa37d4c62007-12-04 23:02:19 +00001586 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001587
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00001588 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if struct.calcsize('P') == 8:
1590 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001591 ascii_struct_size = 48
1592 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 else:
1594 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001595 ascii_struct_size = 24
1596 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
1598 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
1599 code = ord(char)
1600 if code < 0x100:
1601 char_size = 1 # sizeof(Py_UCS1)
1602 struct_size = ascii_struct_size
1603 elif code < 0x10000:
1604 char_size = 2 # sizeof(Py_UCS2)
1605 struct_size = compact_struct_size
1606 else:
1607 char_size = 4 # sizeof(Py_UCS4)
1608 struct_size = compact_struct_size
1609 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02001610 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
1611 # be allocatable, given enough memory.
1612 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613 alloc = lambda: char * maxlen
1614 self.assertRaises(MemoryError, alloc)
1615 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00001616
Victor Stinner808fc0a2010-03-22 12:50:40 +00001617 def test_format_subclass(self):
1618 class S(str):
1619 def __str__(self):
1620 return '__str__ overridden'
1621 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001622 self.assertEqual("%s" % s, '__str__ overridden')
1623 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00001624
# Test PyUnicode_FromFormat()
def test_from_format(self):
    """Exercise the C function PyUnicode_FromFormat() through ctypes."""
    support.import_module('ctypes')
    from ctypes import (pythonapi, py_object,
                        c_int, c_long, c_longlong, c_ssize_t,
                        c_uint, c_ulong, c_ulonglong, c_size_t)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments are wrapped in py_object; anything else is passed
        # to the C function as given.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    # ascii format, non-ascii argument
    text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9')
    self.assertEqual(text, 'ascii\x7f=unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    # (raw literals: the pattern contains the regex escapes \( and \))
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        r'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
    self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')

    # test "%"
    self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
    self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
    self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
    self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
    self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')

    # test integer formats (%i, %d, %u)
    self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
    self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
    self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123')

    self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123')
    self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123')

    self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123')
    self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123')
    self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
    self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')

    # test %A
    text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
    self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")

    # test %V
    text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
    self.assertEqual(text, 'repr=abc')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
    self.assertEqual(text, 'repr=\u4eba\u6c11')

    # Test replace error handler.
    text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
    self.assertEqual(text, 'repr=abc\ufffd')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s')
    self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc')
    self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i')
    self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s')
1705
Victor Stinner1c24bd02010-10-02 11:03:13 +00001706 # Test PyUnicode_AsWideChar()
def test_aswidechar(self):
    """Check the (buffer, size) pair returned by unicode_aswidechar().

    Every case passes a string together with a buffer length to the
    _testcapi helper and compares both the reported size and the
    resulting buffer contents against the expected values.
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # A non-BMP character is expected to occupy two units when c_wchar is
    # two bytes wide and a single unit otherwise; the buffer length leaves
    # exactly one extra slot for the trailing NUL.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        nonbmp_buflen, nonbmp_units = 3, 2
    else:  # sizeof(c_wchar) == 4
        nonbmp_buflen, nonbmp_units = 2, 1

    # (input string, buffer length, expected size, expected buffer)
    cases = (
        # buffer shorter than the string: result is truncated
        ('abcdef', 2, 2, 'ab'),
        # exact fit: no trailing NUL in the result
        ('abc', 3, 3, 'abc'),
        # spare room: the trailing NUL shows up in the buffer
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        # an embedded NUL is counted like any other character
        ('abc\0def', 20, 7, 'abc\0def\0'),
        (nonbmp, nonbmp_buflen, nonbmp_units, nonbmp + '\0'),
    )
    for text, buflen, expected_size, expected_buffer in cases:
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_buffer)
Victor Stinner5593d8a2010-10-02 11:11:27 +00001742
Victor Stinner1c24bd02010-10-02 11:03:13 +00001743 # Test PyUnicode_AsWideCharString()
def test_aswidecharstring(self):
    """Check the (buffer, size) pair returned by unicode_aswidecharstring().

    For every input the returned buffer is expected to be the input
    followed by a single NUL, and the size is expected to exclude
    that NUL.
    """
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # A non-BMP character is expected to take two units when c_wchar is
    # two bytes wide, and one unit otherwise (sizeof(c_wchar) == 4).
    nonbmp = chr(0x10ffff)
    nonbmp_units = 2 if sizeof(c_wchar) == 2 else 1

    # (input string, expected size)
    cases = (
        ('abc', 3),
        ('abc\0def', 7),
        (nonbmp, nonbmp_units),
    )
    for text, expected_size in cases:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, text + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00001765
Benjamin Peterson811c2f12011-09-30 21:31:21 -04001766 def test_subclass_add(self):
1767 class S(str):
1768 def __add__(self, o):
1769 return "3"
1770 self.assertEqual(S("4") + S("5"), "3")
1771 class S(str):
1772 def __iadd__(self, o):
1773 return "3"
1774 s = S("1")
1775 s += "4"
1776 self.assertEqual(s, "3")
1777
Victor Stinner1c24bd02010-10-02 11:03:13 +00001778
class StringModuleTest(unittest.TestCase):
    """Tests for the parsing helpers exposed by the private _string module."""

    def test_formatter_parser(self):
        """formatter_parser() yields one 4-tuple per chunk of a format string.

        The tuples compared here read as (literal text, field name,
        format spec, conversion).
        """
        # (format string, expected list of parsed tuples)
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in cases:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-string argument must be rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() separates the head from its accessors.

        Each accessor is compared as an (is_attribute, name) pair: True for
        ".name" lookups and False for "[name]" lookups.
        """
        def split(field_name):
            # Materialise the lazily produced accessor sequence so that it
            # can be compared against a plain list.
            head, accessors = _string.formatter_field_name_split(field_name)
            return [head, list(accessors)]

        # (field name, expected [head, accessor list])
        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # A non-string argument must be rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
1828
1829
def test_main():
    """Hand this module's name to support.run_unittest()."""
    module_name = __name__
    support.run_unittest(module_name)


if __name__ == "__main__":
    # Executed as a script rather than imported.
    test_main()