blob: 9a5862de08948f2b5dea76a8f81de3b2fd5d00a4 [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Guido van Rossum98297ee2007-11-06 21:34:58 +00008import codecs
9import struct
10import sys
11import unittest
12import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support, string_tests
Eric Smitha1eac722011-01-29 11:15:35 +000014import _string
Guido van Rossuma831cac2000-03-10 23:23:21 +000015
Neal Norwitz430f68b2005-11-24 22:00:56 +000016# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function exposing two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a plain
    int instead of a tuple; "test.unicode2" maps to a pair that returns a
    tuple whose items are ints.  Any other encoding name yields None.

    The result for a known name is the 4-tuple
    (encode, decode, None, None).
    """
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    broken_codecs = {
        "test.unicode1": (encode1, decode1, None, None),
        "test.unicode2": (encode2, decode2, None, None),
    }
    # dict.get() returns None for every encoding we do not provide.
    return broken_codecs.get(encoding)

codecs.register(search_function)
33
Brett Cannon226b2302010-03-20 22:22:22 +000034class UnicodeTest(string_tests.CommonTest,
35 string_tests.MixinStrUnicodeUserStringTest,
36 string_tests.MixinStrUnicodeTest):
37
Guido van Rossumef87d6e2007-05-02 19:09:54 +000038 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000039
def checkequalnofix(self, result, object, methodname, *args):
    # Call object.<methodname>(*args) and check that the value returned
    # equals `result` and has exactly the same type as `result`.
    method = getattr(object, methodname)
    realresult = method(*args)
    self.assertEqual(realresult, result)
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(str):
            def __repr__(self):
                return 'usub(%r)' % str.__repr__(self)
        # Use fresh names instead of rebinding the `object` parameter so
        # the original argument stays visible while debugging a failure.
        subobject = usub(object)
        subresult = getattr(subobject, methodname)(*args)
        self.assertEqual(subresult, result)
        self.assertIsNot(subobject, subresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000057
Jeremy Hylton504de6b2003-10-06 05:08:26 +000058 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000059 self.assertEqual('\xff', '\u00ff')
60 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000061 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
62 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
63 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000064 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000065 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000066
Georg Brandl559e5d72008-06-11 18:37:52 +000067 def test_ascii(self):
68 if not sys.platform.startswith('java'):
69 # Test basic sanity of repr()
70 self.assertEqual(ascii('abc'), "'abc'")
71 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
72 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
73 self.assertEqual(ascii('\\c'), "'\\\\c'")
74 self.assertEqual(ascii('\\'), "'\\\\'")
75 self.assertEqual(ascii('\n'), "'\\n'")
76 self.assertEqual(ascii('\r'), "'\\r'")
77 self.assertEqual(ascii('\t'), "'\\t'")
78 self.assertEqual(ascii('\b'), "'\\x08'")
79 self.assertEqual(ascii("'\""), """'\\'"'""")
80 self.assertEqual(ascii("'\""), """'\\'"'""")
81 self.assertEqual(ascii("'"), '''"'"''')
82 self.assertEqual(ascii('"'), """'"'""")
83 latin1repr = (
84 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
85 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
86 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
87 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
88 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
89 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
90 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
91 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
92 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
93 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
94 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
95 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
96 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
97 "\\xfe\\xff'")
98 testrepr = ascii(''.join(map(chr, range(256))))
99 self.assertEqual(testrepr, latin1repr)
100 # Test ascii works on wide unicode escapes without overflow.
101 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
102 ascii("\U00010000" * 39 + "\uffff" * 4096))
103
104 class WrongRepr:
105 def __repr__(self):
106 return b'byte-repr'
107 self.assertRaises(TypeError, ascii, WrongRepr())
108
Walter Dörwald28256f22003-01-19 16:59:20 +0000109 def test_repr(self):
110 if not sys.platform.startswith('java'):
111 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000112 self.assertEqual(repr('abc'), "'abc'")
113 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
114 self.assertEqual(repr('ab\\'), "'ab\\\\'")
115 self.assertEqual(repr('\\c'), "'\\\\c'")
116 self.assertEqual(repr('\\'), "'\\\\'")
117 self.assertEqual(repr('\n'), "'\\n'")
118 self.assertEqual(repr('\r'), "'\\r'")
119 self.assertEqual(repr('\t'), "'\\t'")
120 self.assertEqual(repr('\b'), "'\\x08'")
121 self.assertEqual(repr("'\""), """'\\'"'""")
122 self.assertEqual(repr("'\""), """'\\'"'""")
123 self.assertEqual(repr("'"), '''"'"''')
124 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000125 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000126 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000127 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
128 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
129 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
130 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
131 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000132 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
133 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
134 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
135 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
136 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
137 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
138 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
139 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000140 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000141 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000142 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000143 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
144 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000145
Georg Brandl559e5d72008-06-11 18:37:52 +0000146 class WrongRepr:
147 def __repr__(self):
148 return b'byte-repr'
149 self.assertRaises(TypeError, repr, WrongRepr())
150
Guido van Rossum49d6b072006-08-17 21:11:47 +0000151 def test_iterators(self):
152 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000153 it = "\u1111\u2222\u3333".__iter__()
154 self.assertEqual(next(it), "\u1111")
155 self.assertEqual(next(it), "\u2222")
156 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000157 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000158
def test_count(self):
    string_tests.CommonTest.test_count(self)
    # Each distinct check appears once; the byte-identical repeats of the
    # first two calls were dropped because they verify nothing new.
    self.checkequalnofix(3, 'aaa', 'count', 'a')
    self.checkequalnofix(0, 'aaa', 'count', 'b')
    # explicit start / end arguments, including negative ones
    self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
    self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
    self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
    self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000171
def test_find(self):
    string_tests.CommonTest.test_find(self)
    self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
    self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
    self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)

    # find() needs a substring argument, and it must be a str.
    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000180
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    # The empty string is reported at the end of the text.  The original
    # issued this exact check twice; one copy is enough.
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000187
def test_index(self):
    string_tests.CommonTest.test_index(self)
    self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
    self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
    self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
    self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
    # index() signals "not found" with ValueError.
    self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
    self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
    self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
    self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000198
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
    self.checkequalnofix(3,  'abcdefghiabc', 'rindex', 'def')
    self.checkequalnofix(9,  'abcdefghiabc', 'rindex', 'abc')
    self.checkequalnofix(0,  'abcdefghiabc', 'rindex', 'abc', 0, -1)

    # rindex() signals "not found" with ValueError.
    self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
    self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
    self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
    self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
    self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000211
def test_maketrans_translate(self):
    # these work with plain translate()
    self.checkequalnofix('bbbc', 'abababc', 'translate',
                         {ord('a'): None})
    self.checkequalnofix('iiic', 'abababc', 'translate',
                         {ord('a'): None, ord('b'): ord('i')})
    self.checkequalnofix('iiix', 'abababc', 'translate',
                         {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
    self.checkequalnofix('c', 'abababc', 'translate',
                         {ord('a'): None, ord('b'): ''})
    self.checkequalnofix('xyyx', 'xzx', 'translate',
                         {ord('z'): 'yy'})
    # this needs maketrans()
    # (a table keyed by one-character strings leaves the text unchanged
    # when handed to translate() directly)
    self.checkequalnofix('abababc', 'abababc', 'translate',
                         {'b': '<i>'})
    tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
    # test alternative way of calling maketrans()
    tbl = self.type2test.maketrans('abc', 'xyz', 'd')
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

    # invalid maketrans() arguments
    self.assertRaises(TypeError, self.type2test.maketrans)
    self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
    self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
    self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
    self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
    self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
    self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})

    # translate() takes exactly one table argument
    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000243
def test_split(self):
    string_tests.CommonTest.test_split(self)

    # Multi-character separators.  The first check used to be issued
    # twice with identical arguments; one copy is kept.
    self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000251
def test_join(self):
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # An object that only *converts* to str; join() must not accept it.
    class MyWrapper:
        def __init__(self, sval):
            self.sval = sval

        def __str__(self):
            return self.sval

    # Joining a list, a tuple and a generic sequence.  The original
    # repeated each of these three checks with identical arguments
    # further down; the repeats were dropped.
    self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
    self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
    self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
    # Every item has to be a real str instance.
    self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
    self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000271
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # replace() with an explicit count only touches the first match
    self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
    # the replacement has to be a str
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000278
@support.cpython_only
def test_replace_id(self):
    # Replacing a substring by itself is expected to hand back the very
    # same string object rather than an equal copy.
    pattern = 'abc'
    text = 'abc def'
    self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200284
def test_bytes_comparison(self):
    # str never compares equal to bytes or bytearray, even when the
    # contents look alike.
    with support.check_warnings():
        # Silence BytesWarning for the comparisons below.
        warnings.simplefilter('ignore', BytesWarning)
        self.assertEqual('abc' == b'abc', False)
        self.assertEqual('abc' != b'abc', True)
        self.assertEqual('abc' == bytearray(b'abc'), False)
        self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000292
Walter Dörwald28256f22003-01-19 16:59:20 +0000293 def test_comparison(self):
294 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000295 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000296 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000297 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000298
299 if 0:
300 # Move these tests to a Unicode collation module test...
301 # Testing UTF-16 code point order comparisons...
302
303 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000304 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000305 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000306 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000307
308 # Non surrogate above surrogate value, fixup required
309 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000310 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000311
312 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000313 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000314 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000315 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000316 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000317 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000318 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000319 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000320 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000321 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000322 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000323 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000324 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000325 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000326 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000327 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000328 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000329 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000330 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000331 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000332 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000333 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000334 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000335 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000336 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000337 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000338 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000339 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000340 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000341 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000342 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000343 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000344 test_lecmp(s, s2)
345
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000346 test_fixup('\ue000')
347 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000348
349 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000350 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000351
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # non-BMP, uppercase
    self.assertFalse('\U00010401'.islower())
    self.assertFalse('\U00010427'.islower())
    # non-BMP, lowercase
    self.assertTrue('\U00010429'.islower())
    self.assertTrue('\U0001044E'.islower())
    # non-BMP, non-cased
    self.assertFalse('\U0001F40D'.islower())
    self.assertFalse('\U0001F46F'.islower())
363 self.assertFalse('\U0001F46F'.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000364
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # NOTE(review): only the check directly below is assumed to be
    # guarded by the platform test -- confirm against the upstream file.
    if not sys.platform.startswith('java'):
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # non-BMP, uppercase
    self.assertTrue('\U00010401'.isupper())
    self.assertTrue('\U00010427'.isupper())
    # non-BMP, lowercase
    self.assertFalse('\U00010429'.isupper())
    self.assertFalse('\U0001044E'.isupper())
    # non-BMP, non-cased
    self.assertFalse('\U0001F40D'.isupper())
    self.assertFalse('\U0001F46F'.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000378
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    self.checkequalnofix(True, '\u1FFc', 'istitle')
    self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')

    # non-BMP, uppercase + lowercase
    self.assertTrue('\U00010401\U00010429'.istitle())
    self.assertTrue('\U00010427\U0001044E'.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
390
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    self.checkequalnofix(True, '\u2000', 'isspace')
    self.checkequalnofix(True, '\u200a', 'isspace')
    self.checkequalnofix(False, '\u2014', 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F']:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
400
def test_isalnum(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    # non-BMP characters that must count as alphanumeric
    for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000406
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    self.assertTrue('\U00010401'.isalpha())
    self.assertTrue('\U00010427'.isalpha())
    self.assertTrue('\U00010429'.isalpha())
    self.assertTrue('\U0001044E'.isalpha())
    # non-BMP, non-cased
    self.assertFalse('\U0001F40D'.isalpha())
    self.assertFalse('\U0001F46F'.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000418
def test_isdecimal(self):
    self.checkequalnofix(False, '', 'isdecimal')
    self.checkequalnofix(False, 'a', 'isdecimal')
    self.checkequalnofix(True, '0', 'isdecimal')
    self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
    self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
    self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
    self.checkequalnofix(True, '0123456789', 'isdecimal')
    self.checkequalnofix(False, '0123456789a', 'isdecimal')

    # isdecimal() takes no arguments
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    # non-BMP characters
    for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
436
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    self.checkequalnofix(True, '\u2460', 'isdigit')
    self.checkequalnofix(False, '\xbc', 'isdigit')
    self.checkequalnofix(True, '\u0660', 'isdigit')

    # non-BMP characters
    for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F', '\U00011065']:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
448
def test_isnumeric(self):
    self.checkequalnofix(False, '', 'isnumeric')
    self.checkequalnofix(False, 'a', 'isnumeric')
    self.checkequalnofix(True, '0', 'isnumeric')
    self.checkequalnofix(True, '\u2460', 'isnumeric')
    self.checkequalnofix(True, '\xbc', 'isnumeric')
    self.checkequalnofix(True, '\u0660', 'isnumeric')
    self.checkequalnofix(True, '0123456789', 'isnumeric')
    self.checkequalnofix(False, '0123456789a', 'isnumeric')

    # isnumeric() takes no arguments
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    # non-BMP characters
    for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F']:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107']:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
467
Martin v. Löwis47383402007-08-15 07:32:56 +0000468 def test_isidentifier(self):
469 self.assertTrue("a".isidentifier())
470 self.assertTrue("Z".isidentifier())
471 self.assertTrue("_".isidentifier())
472 self.assertTrue("b0".isidentifier())
473 self.assertTrue("bc".isidentifier())
474 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000475 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500476 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000477
478 self.assertFalse(" ".isidentifier())
479 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000480 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000481 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000482
Georg Brandl559e5d72008-06-11 18:37:52 +0000483 def test_isprintable(self):
484 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000485 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000486 self.assertTrue("abcdefg".isprintable())
487 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000488 # some defined Unicode character
489 self.assertTrue("\u0374".isprintable())
490 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000491 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000492 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000493 self.assertFalse("\ud800".isprintable())
494
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300495 self.assertTrue('\U0001F46F'.isprintable())
496 self.assertFalse('\U000E0020'.isprintable())
497
498 def test_surrogates(self):
499 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
500 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
501 self.assertTrue(s.islower())
502 self.assertFalse(s.isupper())
503 self.assertFalse(s.istitle())
504 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
505 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
506 self.assertFalse(s.islower())
507 self.assertTrue(s.isupper())
508 self.assertTrue(s.istitle())
509
510 for meth_name in ('islower', 'isupper', 'istitle'):
511 meth = getattr(str, meth_name)
512 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
513 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
514
515 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
516 'isdecimal', 'isnumeric',
517 'isidentifier', 'isprintable'):
518 meth = getattr(str, meth_name)
519 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
520 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
521 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
522 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
523
524
def test_lower(self):
    string_tests.CommonTest.test_lower(self)
    # non-BMP cased letters: U+10427 is the uppercase form of U+1044F
    self.assertEqual('\U00010427'.lower(), '\U0001044F')
    self.assertEqual('\U00010427\U00010427'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.lower(),
                     'x\U0001044Fx\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300534
def test_upper(self):
    string_tests.CommonTest.test_upper(self)
    # non-BMP cased letters: U+1044F is the lowercase form of U+10427
    self.assertEqual('\U0001044F'.upper(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('X\U00010427x\U0001044F'.upper(),
                     'X\U00010427X\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300544
def test_capitalize(self):
    """Run the shared capitalize() checks, then cover non-BMP characters."""
    string_tests.CommonTest.test_capitalize(self)
    # (input, expected result of input.capitalize())
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
    )
    for text, capitalized in cases:
        self.assertEqual(text.capitalize(), capitalized)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300556
def test_title(self):
    """Run the shared title() checks, then cover non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    # Expected value shared by the three two-word inputs that start with
    # a single non-BMP cased character per word.
    two_words = '\U00010427\U0001044F \U00010427\U0001044F'
    # (input, expected result of input.title())
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U0001044F \U0001044F\U0001044F', two_words),
        ('\U00010427\U0001044F \U00010427\U0001044F', two_words),
        ('\U0001044F\U00010427 \U0001044F\U00010427', two_words),
        ('X\U00010427x\U0001044F X\U00010427x\U0001044F',
         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
    )
    for text, titled in cases:
        self.assertEqual(text.title(), titled)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300570
def test_swapcase(self):
    """Run the shared swapcase() checks, then cover non-BMP characters."""
    string_tests.CommonTest.test_swapcase(self)
    # (input, expected result of input.swapcase())
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U00010427', '\U0001044F'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U0001044F\U00010427'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044FX\U00010427'),
    )
    for text, swapped in cases:
        self.assertEqual(text.swapcase(), swapped)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300583
Walter Dörwald28256f22003-01-19 16:59:20 +0000584 def test_contains(self):
585 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000586 self.assertIn('a', 'abdb')
587 self.assertIn('a', 'bdab')
588 self.assertIn('a', 'bdaba')
589 self.assertIn('a', 'bdba')
590 self.assertNotIn('a', 'bdb')
591 self.assertIn('a', 'bdba')
592 self.assertIn('a', ('a',1,None))
593 self.assertIn('a', (1,None,'a'))
594 self.assertIn('a', ('a',1,None))
595 self.assertIn('a', (1,None,'a'))
596 self.assertNotIn('a', ('x',1,'y'))
597 self.assertNotIn('a', ('x',1,None))
598 self.assertNotIn('abcd', 'abcxxxx')
599 self.assertIn('ab', 'abcd')
600 self.assertIn('ab', 'abc')
601 self.assertIn('ab', (1,None,'ab'))
602 self.assertIn('', 'abc')
603 self.assertIn('', '')
604 self.assertIn('', 'abc')
605 self.assertNotIn('\0', 'abc')
606 self.assertIn('\0', '\0abc')
607 self.assertIn('\0', 'abc\0')
608 self.assertIn('a', '\0abc')
609 self.assertIn('asdf', 'asdf')
610 self.assertNotIn('asdf', 'asd')
611 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000612
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000613 self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000614
def test_format(self):
    """Exercise str.format(): escapes, field lookup, specs, conversions, errors.

    The padded expected values are written out with their full
    whitespace so that they agree with the width requested in the
    corresponding format spec.
    """
    self.assertEqual(''.format(), '')
    self.assertEqual('a'.format(), 'a')
    self.assertEqual('ab'.format(), 'ab')
    self.assertEqual('a{{'.format(), 'a{')
    self.assertEqual('a}}'.format(), 'a}')
    self.assertEqual('{{b'.format(), '{b')
    self.assertEqual('}}b'.format(), '}b')
    self.assertEqual('a{{b'.format(), 'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
    self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                     "My name is Fred")
    self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                     "My name is Fred :-{}")

    d = datetime.date(2007, 8, 18)
    self.assertEqual("The year is {0.year}".format(d),
                     "The year is 2007")

    # classes we'll use for testing
    class C:
        # __format__ echoes the format spec back unchanged
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        # __format__ ignores the spec and returns str(x)
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return 'E(' + self.x + ')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return 'F(' + self.x + ')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return "string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return 'G(' + self.x + ')'
            return object.__format__(self, format_spec)

    class I(datetime.date):
        # the format spec is used as a strftime() pattern
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        # formats twice the stored value
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(''.format(), '')
    self.assertEqual('abc'.format(), 'abc')
    self.assertEqual('{0}'.format('abc'), 'abc')
    self.assertEqual('{0:}'.format('abc'), 'abc')
#    self.assertEqual('{ 0 }'.format('abc'), 'abc')
    self.assertEqual('X{0}'.format('abc'), 'Xabc')
    self.assertEqual('{0}X'.format('abc'), 'abcX')
    self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
    self.assertEqual('{1}'.format(1, 'abc'), 'abc')
    self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
    self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
    self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
    self.assertEqual('{0}'.format(-15), '-15')
    self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
    self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
    self.assertEqual('{{'.format(), '{')
    self.assertEqual('}}'.format(), '}')
    self.assertEqual('{{}}'.format(), '{}')
    self.assertEqual('{{x}}'.format(), '{x}')
    self.assertEqual('{{{0}}}'.format(123), '{123}')
    self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
    self.assertEqual('}}{{'.format(), '}{')
    self.assertEqual('}}x{{'.format(), '}x{')

    # weird field names
    self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
    self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
    self.assertEqual("{0[ ]}".format({' ':3}), '3')

    self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
    self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
    self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
    self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
    self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
    self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
    self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

    # strings
    self.assertEqual('{0:.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:.3s}'.format('ab'), 'ab')
    self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
    self.assertEqual('{0:.0s}'.format('abcdef'), '')
    self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
    self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
    self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
    self.assertEqual('{0:x<0s}'.format('result'), 'result')
    self.assertEqual('{0:x<5s}'.format('result'), 'result')
    self.assertEqual('{0:x<6s}'.format('result'), 'result')
    self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
    self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
    self.assertEqual('{0: <7s}'.format('result'), 'result ')
    self.assertEqual('{0:<7s}'.format('result'), 'result ')
    self.assertEqual('{0:>7s}'.format('result'), ' result')
    # width 8 around a 6-character value: two fill characters
    self.assertEqual('{0:>8s}'.format('result'), '  result')
    self.assertEqual('{0:^8s}'.format('result'), ' result ')
    # odd amount of padding when centering: the extra space goes right
    self.assertEqual('{0:^9s}'.format('result'), ' result  ')
    self.assertEqual('{0:^10s}'.format('result'), '  result  ')
    self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
    self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
    self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual('{0:abc}'.format(C()), 'abc')

    # !r, !s and !a coercions
    self.assertEqual('{0!s}'.format('Hello'), 'Hello')
    self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
    # width 15 around a 5-character value: ten trailing spaces
    self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
    self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
    self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
    self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
    self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
    self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
    self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
    self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
    self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
    self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
    self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

    # test fallback to object.__format__
    self.assertEqual('{0}'.format({}), '{}')
    self.assertEqual('{0}'.format([]), '[]')
    self.assertEqual('{0}'.format([1]), '[1]')

    self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
    self.assertEqual('{0!s}'.format(G('data')), 'string is data')

    msg = 'object.__format__ with a non-empty format string is deprecated'
    with support.check_warnings((msg, DeprecationWarning)):
        # 'E(data)' is 7 characters wide; centering in 10 pads 1 + 2
        self.assertEqual('{0:^10}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:^10s}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:>15s}'.format(G('data')), ' string is data')

    self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                   month=8,
                                                   day=27)),
                     "date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual("{0}".format(J(10)), "20")


    # string format specifiers
    self.assertEqual('{0:}'.format('a'), 'a')

    # computed format specifiers
    self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
    self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
    self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
    # width 10 after truncating to 5 characters: five trailing spaces
    self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
    self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

    # test various errors
    self.assertRaises(ValueError, '{'.format)
    self.assertRaises(ValueError, '}'.format)
    self.assertRaises(ValueError, 'a{'.format)
    self.assertRaises(ValueError, 'a}'.format)
    self.assertRaises(ValueError, '{a'.format)
    self.assertRaises(ValueError, '}a'.format)
    self.assertRaises(IndexError, '{0}'.format)
    self.assertRaises(IndexError, '{1}'.format, 'abc')
    self.assertRaises(KeyError,   '{x}'.format)
    self.assertRaises(ValueError, "}{".format)
    self.assertRaises(ValueError, "abc{0:{}".format)
    self.assertRaises(ValueError, "{0".format)
    self.assertRaises(IndexError, "{0.}".format)
    self.assertRaises(ValueError, "{0.}".format, 0)
    self.assertRaises(IndexError, "{0[}".format)
    self.assertRaises(ValueError, "{0[}".format, [])
    self.assertRaises(KeyError,   "{0]}".format)
    self.assertRaises(ValueError, "{0.[]}".format, 0)
    self.assertRaises(ValueError, "{0..foo}".format, 0)
    self.assertRaises(ValueError, "{0[0}".format, 0)
    self.assertRaises(ValueError, "{0[0:foo}".format, 0)
    self.assertRaises(KeyError,   "{c]}".format)
    self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, "{0}}".format, 0)
    self.assertRaises(KeyError,   "{foo}".format, bar=3)
    self.assertRaises(ValueError, "{0!x}".format, 3)
    self.assertRaises(ValueError, "{0!}".format, 0)
    self.assertRaises(ValueError, "{0!rs}".format, 0)
    self.assertRaises(ValueError, "{!}".format)
    self.assertRaises(IndexError, "{:}".format)
    self.assertRaises(IndexError, "{:s}".format)
    self.assertRaises(IndexError, "{}".format)
    big = "23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, ("{" + big + "}").format)
    self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, "{0[0]x}".format, [None])
    self.assertRaises(ValueError, "{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
    self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, "{0:-s}".format, '')
    self.assertRaises(ValueError, format, "", "-")
    self.assertRaises(ValueError, "{0:=s}".format, '')

    # Alternate formatting is not supported
    self.assertRaises(ValueError, format, '', '#')
    self.assertRaises(ValueError, format, '', '#20')
857
Eric Smith27bbca62010-11-04 17:06:58 +0000858 def test_format_map(self):
859 self.assertEqual(''.format_map({}), '')
860 self.assertEqual('a'.format_map({}), 'a')
861 self.assertEqual('ab'.format_map({}), 'ab')
862 self.assertEqual('a{{'.format_map({}), 'a{')
863 self.assertEqual('a}}'.format_map({}), 'a}')
864 self.assertEqual('{{b'.format_map({}), '{b')
865 self.assertEqual('}}b'.format_map({}), '}b')
866 self.assertEqual('a{{b'.format_map({}), 'a{b')
867
868 # using mappings
869 class Mapping(dict):
870 def __missing__(self, key):
871 return key
872 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
873 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
874
875 class InternalMapping:
876 def __init__(self):
877 self.mapping = {'a': 'hello'}
878 def __getitem__(self, key):
879 return self.mapping[key]
880 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
881
882
Eric Smith27bbca62010-11-04 17:06:58 +0000883 class C:
884 def __init__(self, x=100):
885 self._x = x
886 def __format__(self, spec):
887 return spec
Eric Smith27bbca62010-11-04 17:06:58 +0000888 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
889
890 # test various errors
891 self.assertRaises(TypeError, '{'.format_map)
892 self.assertRaises(TypeError, '}'.format_map)
893 self.assertRaises(TypeError, 'a{'.format_map)
894 self.assertRaises(TypeError, 'a}'.format_map)
895 self.assertRaises(TypeError, '{a'.format_map)
896 self.assertRaises(TypeError, '}a'.format_map)
897
Eric V. Smith12ebefc2011-07-18 14:03:41 -0400898 # issue #12579: can't supply positional params to format_map
899 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
900 self.assertRaises(ValueError, '{}'.format_map, 'a')
901 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
902
Eric Smith8ec90442009-03-14 12:29:34 +0000903 def test_format_auto_numbering(self):
904 class C:
905 def __init__(self, x=100):
906 self._x = x
907 def __format__(self, spec):
908 return spec
909
910 self.assertEqual('{}'.format(10), '10')
911 self.assertEqual('{:5}'.format('s'), 's ')
912 self.assertEqual('{!r}'.format('s'), "'s'")
913 self.assertEqual('{._x}'.format(C(10)), '10')
914 self.assertEqual('{[1]}'.format([1, 2]), '2')
915 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
916 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
917
918 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
919 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
920
921 # can't mix and match numbering and auto-numbering
922 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
923 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
924 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
925 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
926
927 # can mix and match auto-numbering and named
928 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
929 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
930 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
931 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
932
def test_formatting(self):
    """Exercise %-formatting on str after running the shared checks.

    Expected values that involve a field width are written with their
    full padding so they agree with the width in the format string.
    """
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads '3.00' to five columns, hence the doubled space
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes the width (and precision) from the argument tuple;
    # a negative width left-justifies.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')
986
Ezio Melottiba42fd52011-04-26 06:09:45 +0300987 def test_startswith_endswith_errors(self):
988 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +0300989 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +0300990 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +0300991 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +0300992 self.assertIn('str', exc)
993 self.assertIn('tuple', exc)
994
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    """%-formatting a float must give '1.0' under the decorator's locales."""
    # should not format with a comma, but always with C locale
    rendered = '%.1f' % 1.0
    self.assertEqual('1.0', rendered)
Georg Brandlda6b1072006-01-20 17:48:54 +0000999
Walter Dörwald28256f22003-01-19 16:59:20 +00001000 def test_constructor(self):
1001 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1002
1003 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001004 str('unicode remains unicode'),
1005 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001006 )
1007
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001008 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001009 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001010
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001011 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1012 subclass = UnicodeSubclass(text)
1013 self.assertEqual(str(subclass), text)
1014 self.assertEqual(len(subclass), len(text))
1015 if text == 'ascii':
1016 self.assertEqual(subclass.encode('ascii'), b'ascii')
1017 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001018
Walter Dörwald28256f22003-01-19 16:59:20 +00001019 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001020 str('strings are converted to unicode'),
1021 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001022 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001023
Walter Dörwald28256f22003-01-19 16:59:20 +00001024 class StringCompat:
1025 def __init__(self, x):
1026 self.x = x
1027 def __str__(self):
1028 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001029
Walter Dörwald28256f22003-01-19 16:59:20 +00001030 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001031 str(StringCompat('__str__ compatible objects are recognized')),
1032 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001033 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001034
Walter Dörwald28256f22003-01-19 16:59:20 +00001035 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001036
Walter Dörwald28256f22003-01-19 16:59:20 +00001037 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001038 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001039 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001040
Guido van Rossume2a383d2007-01-15 16:59:06 +00001041 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001042 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001043
Walter Dörwald28256f22003-01-19 16:59:20 +00001044 # unicode(obj, encoding, error) tests (this maps to
1045 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001046
Walter Dörwald28256f22003-01-19 16:59:20 +00001047 if not sys.platform.startswith('java'):
1048 self.assertRaises(
1049 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001050 str,
1051 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001052 'utf-8',
1053 'strict'
1054 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001055
Walter Dörwald28256f22003-01-19 16:59:20 +00001056 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001057 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001058 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001059 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001060
Walter Dörwald28256f22003-01-19 16:59:20 +00001061 if not sys.platform.startswith('java'):
1062 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001063 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001064 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001065 'utf-8',
1066 'strict'
1067 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001068 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001069 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001070
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001071 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001072
Walter Dörwald28256f22003-01-19 16:59:20 +00001073 def test_codecs_utf7(self):
1074 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001075 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1076 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1077 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1078 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1079 ('+', b'+-'),
1080 ('+-', b'+--'),
1081 ('+?', b'+-?'),
1082 ('\?', b'+AFw?'),
1083 ('+?', b'+-?'),
1084 (r'\\?', b'+AFwAXA?'),
1085 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001086 (r'++--', b'+-+---'),
1087 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1088 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001089 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001090
Walter Dörwald28256f22003-01-19 16:59:20 +00001091 for (x, y) in utfTests:
1092 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001093
Antoine Pitrou244651a2009-05-04 18:56:13 +00001094 # Unpaired surrogates not supported
Walter Dörwald67e83882007-05-05 12:26:27 +00001095 self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001096
Antoine Pitrou244651a2009-05-04 18:56:13 +00001097 self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001098
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001099 # Issue #2242: crash on some Windows/MSVC versions
Antoine Pitrou244651a2009-05-04 18:56:13 +00001100 self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1101
1102 # Direct encoded characters
1103 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1104 # Optional direct characters
1105 set_o = '!"#$%&*;<=>@[]^_`{|}'
1106 for c in set_d:
1107 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1108 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1109 for c in set_o:
1110 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001111
Walter Dörwald28256f22003-01-19 16:59:20 +00001112 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001113 self.assertEqual(''.encode('utf-8'), b'')
1114 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001115 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1116 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001117 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1118 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001119 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1120 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001121 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001122 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1123 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1124 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1125 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1126 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1127 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001128 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1129 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1130 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1131 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1132 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1133 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1134 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1135 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1136 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1137 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001138 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001139
Walter Dörwald28256f22003-01-19 16:59:20 +00001140 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001141 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1142 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1143 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001144
Walter Dörwald28256f22003-01-19 16:59:20 +00001145 # Other possible utf-8 test cases:
1146 # * strict decoding testing for all of the
1147 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148
Ezio Melotti57221d02010-07-01 07:32:02 +00001149 def test_utf8_decode_valid_sequences(self):
1150 sequences = [
1151 # single byte
1152 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1153 # 2 bytes
1154 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1155 # 3 bytes
1156 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1157 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1158 # 4 bytes
1159 (b'\xF0\x90\x80\x80', '\U00010000'),
1160 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1161 ]
1162 for seq, res in sequences:
1163 self.assertEqual(seq.decode('utf-8'), res)
1164
1165
1166 def test_utf8_decode_invalid_sequences(self):
1167 # continuation bytes in a sequence of 2, 3, or 4 bytes
1168 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1169 # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
1170 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1171 # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
1172 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1173 invalid_start_bytes = (
1174 continuation_bytes + invalid_2B_seq_start_bytes +
1175 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1176 )
1177
1178 for byte in invalid_start_bytes:
1179 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1180
1181 for sb in invalid_2B_seq_start_bytes:
1182 for cb in continuation_bytes:
1183 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1184
1185 for sb in invalid_4B_seq_start_bytes:
1186 for cb1 in continuation_bytes[:3]:
1187 for cb3 in continuation_bytes[:3]:
1188 self.assertRaises(UnicodeDecodeError,
1189 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1190
1191 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1192 self.assertRaises(UnicodeDecodeError,
1193 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1194 self.assertRaises(UnicodeDecodeError,
1195 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1196 # surrogates
1197 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1198 self.assertRaises(UnicodeDecodeError,
1199 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1200 self.assertRaises(UnicodeDecodeError,
1201 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1202 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1203 self.assertRaises(UnicodeDecodeError,
1204 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1205 self.assertRaises(UnicodeDecodeError,
1206 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1207 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1208 self.assertRaises(UnicodeDecodeError,
1209 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1210 self.assertRaises(UnicodeDecodeError,
1211 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1212
1213 def test_issue8271(self):
1214 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1215 # only the start byte and the continuation byte(s) are now considered
1216 # invalid, instead of the number of bytes specified by the start byte.
1217 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1218 # table 3-8, Row 2) for more information about the algorithm used.
1219 FFFD = '\ufffd'
1220 sequences = [
1221 # invalid start bytes
1222 (b'\x80', FFFD), # continuation byte
1223 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1224 (b'\xc0', FFFD),
1225 (b'\xc0\xc0', FFFD*2),
1226 (b'\xc1', FFFD),
1227 (b'\xc1\xc0', FFFD*2),
1228 (b'\xc0\xc1', FFFD*2),
1229 # with start byte of a 2-byte sequence
1230 (b'\xc2', FFFD), # only the start byte
1231 (b'\xc2\xc2', FFFD*2), # 2 start bytes
1232 (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes
1233 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1234 # with start byte of a 3-byte sequence
1235 (b'\xe1', FFFD), # only the start byte
1236 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1237 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1238 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1239 (b'\xe1\x80', FFFD), # only 1 continuation byte
1240 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1241 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1242 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1243 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1244 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1245 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1246 # with start byte of a 4-byte sequence
1247 (b'\xf1', FFFD), # only the start byte
1248 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1249 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1250 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1251 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1252 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1253 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1254 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1255 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1256 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1257 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1258 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1259 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1260 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1261 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1262 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1263 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1264 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1265 # with invalid start byte of a 4-byte sequence (rfc2279)
1266 (b'\xf5', FFFD), # only the start byte
1267 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1268 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1269 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1270 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1271 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1272 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1273 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1274 # with invalid start byte of a 5-byte sequence (rfc2279)
1275 (b'\xf8', FFFD), # only the start byte
1276 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1277 (b'\xf8\x80', FFFD*2), # only one continuation byte
1278 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1279 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1280 # with invalid start byte of a 6-byte sequence (rfc2279)
1281 (b'\xfc', FFFD), # only the start byte
1282 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1283 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1284 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1285 # invalid start byte
1286 (b'\xfe', FFFD),
1287 (b'\xfe\x80\x80', FFFD*3),
1288 # other sequences
1289 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1290 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1291 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1292 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1293 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1294 ]
1295 for n, (seq, res) in enumerate(sequences):
1296 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1297 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1298 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1299 self.assertEqual(seq.decode('utf-8', 'ignore'),
1300 res.replace('\uFFFD', ''))
1301
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001302 def test_codecs_idna(self):
1303 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001304 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001305
def test_codecs_errors(self):
    """Exercise codec error handlers, broken codecs and bad arguments."""
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding): 'strict' is also the implicit default.
    for handler in ((), ('strict',)):
        self.assertRaises(UnicodeError, text.encode, 'ascii', *handler)
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Keyword arguments must give the same result as positional ones.
    positional_replace = text.encode('ascii', 'replace')
    self.assertEqual(positional_replace,
                     text.encode('ascii', errors='replace'))
    positional_ignore = text.encode('ascii', 'ignore')
    self.assertEqual(positional_ignore,
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    for handler in ((), ('strict',)):
        self.assertRaises(UnicodeError, str, data, 'ascii', *handler)
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # "test.unicode1" / "test.unicode2" are the deliberately broken codecs
    # registered at module level by search_function().
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, "hello".encode, broken_codec)
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(ImportError, imp.find_module,
                      "non-existing module", ["non-existing dir"])

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (int, float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            self.assertRaises(UnicodeError, convert, lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00001352
Walter Dörwald28256f22003-01-19 16:59:20 +00001353 def test_codecs(self):
1354 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00001355 self.assertEqual('hello'.encode('ascii'), b'hello')
1356 self.assertEqual('hello'.encode('utf-7'), b'hello')
1357 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001358 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00001359 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
1360 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
1361 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00001362
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001363 # Default encoding is utf-8
1364 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
1365
Walter Dörwald28256f22003-01-19 16:59:20 +00001366 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001367 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001368 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001369 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1370 'utf-16-be', 'raw_unicode_escape',
1371 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001372 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001373
Walter Dörwald28256f22003-01-19 16:59:20 +00001374 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001375 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001376 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001377 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001378 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001379
Walter Dörwald28256f22003-01-19 16:59:20 +00001380 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001381 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001382 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001383 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001384 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001385
Walter Dörwald28256f22003-01-19 16:59:20 +00001386 # Roundtrip safety for non-BMP (just a few chars)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001387 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
Walter Dörwald28256f22003-01-19 16:59:20 +00001388 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1389 #'raw_unicode_escape',
1390 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001391 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001392
Walter Dörwald28256f22003-01-19 16:59:20 +00001393 # UTF-8 must be roundtrip safe for all UCS-2 code points
1394 # This excludes surrogates: in the full range, there would be
1395 # a surrogate pair (\udbff\udc00), which gets converted back
1396 # to a non-BMP character (\U0010fc00)
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001397 u = ''.join(map(chr, list(range(0,0xd800)) +
1398 list(range(0xe000,0x10000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00001399 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001400 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001401
Walter Dörwald28256f22003-01-19 16:59:20 +00001402 def test_codecs_charmap(self):
1403 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00001404 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00001405 for encoding in (
1406 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001407 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1408 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001409 'cp863', 'cp865', 'cp866',
1410 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1411 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1412 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1413 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Walter Dörwald28256f22003-01-19 16:59:20 +00001415 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1416 'cp1256', 'cp1257', 'cp1258',
1417 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001418
Walter Dörwald28256f22003-01-19 16:59:20 +00001419 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1420 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001421
Walter Dörwald28256f22003-01-19 16:59:20 +00001422 ### These have undefined mappings:
1423 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001424
Walter Dörwald28256f22003-01-19 16:59:20 +00001425 ### These fail the round-trip:
1426 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00001427
Walter Dörwald28256f22003-01-19 16:59:20 +00001428 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001429 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001430
Walter Dörwald28256f22003-01-19 16:59:20 +00001431 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00001432 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00001433 for encoding in (
1434 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001435 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1436 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001437 'cp863', 'cp865', 'cp866',
1438 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1439 'iso8859_2', 'iso8859_4', 'iso8859_5',
1440 'iso8859_9', 'koi8_r', 'latin_1',
1441 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00001442
Walter Dörwald28256f22003-01-19 16:59:20 +00001443 ### These have undefined mappings:
1444 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1445 #'cp1256', 'cp1257', 'cp1258',
1446 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1447 #'iso8859_3', 'iso8859_6', 'iso8859_7',
1448 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00001449
Walter Dörwald28256f22003-01-19 16:59:20 +00001450 ### These fail the round-trip:
1451 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00001452
Walter Dörwald28256f22003-01-19 16:59:20 +00001453 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001454 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001455
Walter Dörwald28256f22003-01-19 16:59:20 +00001456 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001457 self.assertEqual(("abc" "def"), "abcdef")
1458 self.assertEqual(("abc" "def"), "abcdef")
1459 self.assertEqual(("abc" "def"), "abcdef")
1460 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1461 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001462
Walter Dörwald28256f22003-01-19 16:59:20 +00001463 def test_printing(self):
1464 class BitBucket:
1465 def write(self, text):
1466 pass
Fred Drake004d5e62000-10-23 17:22:08 +00001467
Walter Dörwald28256f22003-01-19 16:59:20 +00001468 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001469 print('abc', file=out)
1470 print('abc', 'def', file=out)
1471 print('abc', 'def', file=out)
1472 print('abc', 'def', file=out)
1473 print('abc\n', file=out)
1474 print('abc\n', end=' ', file=out)
1475 print('abc\n', end=' ', file=out)
1476 print('def\n', file=out)
1477 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00001478
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001479 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001480 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001481 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1482 self.assertEqual(x, y)
1483
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001484 y = br'\U00100000'
1485 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1486 self.assertEqual(x, y)
1487 y = br'\U00010000'
1488 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1489 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00001490
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001491 try:
1492 br'\U11111111'.decode("raw-unicode-escape")
1493 except UnicodeDecodeError as e:
1494 self.assertEqual(e.start, 0)
1495 self.assertEqual(e.end, 10)
1496 else:
1497 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00001498
Brett Cannonc3647ac2005-04-26 03:45:26 +00001499 def test_conversion(self):
1500 # Make sure __unicode__() works properly
1501 class Foo0:
1502 def __str__(self):
1503 return "foo"
1504
1505 class Foo1:
Guido van Rossum98297ee2007-11-06 21:34:58 +00001506 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001507 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001508
1509 class Foo2(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001510 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001511 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001512
1513 class Foo3(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001514 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001515 return "foo"
1516
1517 class Foo4(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001518 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001519 return "foo"
1520
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001521 class Foo5(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001522 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001523 return "foo"
1524
1525 class Foo6(str):
1526 def __str__(self):
1527 return "foos"
1528
Guido van Rossum98297ee2007-11-06 21:34:58 +00001529 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001530 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001531
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001532 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001533 def __str__(self):
1534 return "foos"
Guido van Rossum98297ee2007-11-06 21:34:58 +00001535 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001536 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001537
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001538 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001539 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001540 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001541 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001542 return self
1543
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001544 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001545 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001546 return "not unicode"
1547
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001548 self.assertEqual(str(Foo0()), "foo")
1549 self.assertEqual(str(Foo1()), "foo")
1550 self.assertEqual(str(Foo2()), "foo")
1551 self.assertEqual(str(Foo3()), "foo")
1552 self.assertEqual(str(Foo4("bar")), "foo")
1553 self.assertEqual(str(Foo5("bar")), "foo")
1554 self.assertEqual(str(Foo6("bar")), "foou")
1555 self.assertEqual(str(Foo7("bar")), "foou")
1556 self.assertEqual(str(Foo8("foo")), "foofoo")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001557 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +00001558
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001559 def test_unicode_repr(self):
1560 class s1:
1561 def __repr__(self):
1562 return '\\n'
1563
1564 class s2:
1565 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001566 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001567
1568 self.assertEqual(repr(s1()), '\\n')
1569 self.assertEqual(repr(s2()), '\\n')
1570
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001571 def test_printable_repr(self):
1572 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001573 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001574
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001575 def test_expandtabs_overflows_gracefully(self):
1576 # This test only affects 32-bit platforms because expandtabs can only take
1577 # an int as the max value, not a 64-bit C long. If expandtabs is changed
1578 # to take a 64-bit long, this test should apply to all platforms.
Christian Heimesa37d4c62007-12-04 23:02:19 +00001579 if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001580 return
Christian Heimesa37d4c62007-12-04 23:02:19 +00001581 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001582
@support.cpython_only
def test_expandtabs_optimization(self):
    """expandtabs() on a string without tabs returns the same object."""
    original = 'abc'
    expanded = original.expandtabs()
    self.assertIs(expanded, original)
1587
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00001588 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if struct.calcsize('P') == 8:
1590 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001591 ascii_struct_size = 48
1592 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 else:
1594 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001595 ascii_struct_size = 24
1596 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
1598 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
1599 code = ord(char)
1600 if code < 0x100:
1601 char_size = 1 # sizeof(Py_UCS1)
1602 struct_size = ascii_struct_size
1603 elif code < 0x10000:
1604 char_size = 2 # sizeof(Py_UCS2)
1605 struct_size = compact_struct_size
1606 else:
1607 char_size = 4 # sizeof(Py_UCS4)
1608 struct_size = compact_struct_size
1609 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02001610 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
1611 # be allocatable, given enough memory.
1612 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613 alloc = lambda: char * maxlen
1614 self.assertRaises(MemoryError, alloc)
1615 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00001616
Victor Stinner808fc0a2010-03-22 12:50:40 +00001617 def test_format_subclass(self):
1618 class S(str):
1619 def __str__(self):
1620 return '__str__ overridden'
1621 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001622 self.assertEqual("%s" % s, '__str__ overridden')
1623 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00001624
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001625 # Test PyUnicode_FromFormat()
Victor Stinner1205f272010-09-11 00:54:47 +00001626 def test_from_format(self):
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001627 support.import_module('ctypes')
Victor Stinner6d970f42011-03-02 00:04:25 +00001628 from ctypes import (pythonapi, py_object,
1629 c_int, c_long, c_longlong, c_ssize_t,
1630 c_uint, c_ulong, c_ulonglong, c_size_t)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 name = "PyUnicode_FromFormat"
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001632 _PyUnicode_FromFormat = getattr(pythonapi, name)
1633 _PyUnicode_FromFormat.restype = py_object
1634
1635 def PyUnicode_FromFormat(format, *args):
1636 cargs = tuple(
1637 py_object(arg) if isinstance(arg, str) else arg
1638 for arg in args)
1639 return _PyUnicode_FromFormat(format, *cargs)
Victor Stinner1205f272010-09-11 00:54:47 +00001640
1641 # ascii format, non-ascii argument
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001642 text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9')
Victor Stinner1205f272010-09-11 00:54:47 +00001643 self.assertEqual(text, 'ascii\x7f=unicode\xe9')
1644
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001645 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
1646 # raises an error
Ezio Melottied3a7d22010-12-01 02:32:32 +00001647 self.assertRaisesRegex(ValueError,
Victor Stinner1205f272010-09-11 00:54:47 +00001648 '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
Victor Stinner4c7db312010-09-12 07:51:18 +00001649 'string, got a non-ASCII byte: 0xe9$',
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001650 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00001651
Victor Stinner96865452011-03-01 23:44:09 +00001652 # test "%c"
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001653 self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
1654 self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
1655
Victor Stinner96865452011-03-01 23:44:09 +00001656 # test "%"
1657 self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
1658 self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
1659 self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
1660 self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
1661 self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')
1662
Victor Stinner6d970f42011-03-02 00:04:25 +00001663 # test integer formats (%i, %d, %u)
Victor Stinner96865452011-03-01 23:44:09 +00001664 self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
1665 self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
Victor Stinner6d970f42011-03-02 00:04:25 +00001666 self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123')
1667 self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123')
1668 self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123')
1669 self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123')
Victor Stinner96865452011-03-01 23:44:09 +00001670
Victor Stinner6d970f42011-03-02 00:04:25 +00001671 self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123')
1672 self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123')
1673 self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123')
1674 self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123')
Victor Stinner96865452011-03-01 23:44:09 +00001675
Victor Stinner6d970f42011-03-02 00:04:25 +00001676 self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123')
1677 self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123')
1678 self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
1679 self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')
1680
1681 # test %A
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001682 text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
Victor Stinner9a909002010-10-18 20:59:24 +00001683 self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
1684
Victor Stinner6d970f42011-03-02 00:04:25 +00001685 # test %V
Victor Stinner2512a8b2011-03-01 22:46:52 +00001686 text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
1687 self.assertEqual(text, 'repr=abc')
1688
1689 # Test string decode from parameter of %s using utf-8.
1690 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
1691 # '\u4eba\u6c11'
1692 text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
1693 self.assertEqual(text, 'repr=\u4eba\u6c11')
1694
1695 #Test replace error handler.
1696 text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
1697 self.assertEqual(text, 'repr=abc\ufffd')
1698
Victor Stinner6d970f42011-03-02 00:04:25 +00001699 # not supported: copy the raw format string. these tests are just here
1700 # to check for crashes and should not be considered as specifications
1701 self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s')
1702 self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc')
1703 self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i')
1704 self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s')
1705
Victor Stinner1c24bd02010-10-02 11:03:13 +00001706 # Test PyUnicode_AsWideChar()
def test_aswidechar(self):
    """Exercise PyUnicode_AsWideChar() via ``_testcapi.unicode_aswidechar``.

    ``unicode_aswidechar(text, buflen)`` is called with a string and a
    buffer length and returns a ``(wchar, size)`` pair.  The cases below
    cover a buffer shorter than the string, an exact fit, buffers with
    spare room, an embedded NUL and a non-BMP character.

    The test is skipped when ``_testcapi`` or ``ctypes`` is unavailable.
    """
    # Both modules are optional.  Going through support.import_module()
    # for each of them reports a missing module as a skipped test instead
    # of an ImportError, and treats the two dependencies consistently.
    unicode_aswidechar = support.import_module('_testcapi').unicode_aswidechar
    ctypes = support.import_module('ctypes')

    cases = [
        # (text, buflen, expected size, expected wchar)
        ('abcdef', 2, 2, 'ab'),               # buffer shorter than text
        ('abc', 3, 3, 'abc'),                 # exact fit, no trailing NUL
        ('abc', 4, 3, 'abc\0'),               # one spare slot
        ('abc', 10, 3, 'abc\0'),              # plenty of spare room
        ('abc\0def', 20, 7, 'abc\0def\0'),    # embedded NUL is kept
    ]
    for text, buflen, expected_size, expected_wchar in cases:
        # Identify the failing case, since the loop hides the line number.
        case = 'unicode_aswidechar(%r, %r)' % (text, buflen)
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size, case)
        self.assertEqual(wchar, expected_wchar, case)

    # The expected size of a non-BMP character depends on the platform's
    # wchar_t width: 2 units for 2-byte wchar_t (presumably a surrogate
    # pair), 1 unit otherwise.  The buffer leaves one extra slot.
    nonbmp = chr(0x10ffff)
    if ctypes.sizeof(ctypes.c_wchar) == 2:
        buflen, nchar = 3, 2
    else:  # sizeof(c_wchar) == 4
        buflen, nchar = 2, 1
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00001742
Victor Stinner1c24bd02010-10-02 11:03:13 +00001743 # Test PyUnicode_AsWideCharString()
def test_aswidecharstring(self):
    """Exercise PyUnicode_AsWideCharString() via ``_testcapi``.

    ``unicode_aswidecharstring(text)`` returns a ``(wchar, size)`` pair.
    The assertions below expect ``wchar`` to carry a trailing NUL that
    is not counted in ``size``.

    The test is skipped when ``_testcapi`` or ``ctypes`` is unavailable.
    """
    # Both modules are optional.  Going through support.import_module()
    # for each of them reports a missing module as a skipped test instead
    # of an ImportError, and treats the two dependencies consistently.
    testcapi = support.import_module('_testcapi')
    unicode_aswidecharstring = testcapi.unicode_aswidecharstring
    ctypes = support.import_module('ctypes')

    wchar, size = unicode_aswidecharstring('abc')
    self.assertEqual(size, 3)
    self.assertEqual(wchar, 'abc\0')

    # An embedded NUL must not cut the conversion short.
    wchar, size = unicode_aswidecharstring('abc\0def')
    self.assertEqual(size, 7)
    self.assertEqual(wchar, 'abc\0def\0')

    # The expected size of a non-BMP character depends on the platform's
    # wchar_t width: 2 units for 2-byte wchar_t (presumably a surrogate
    # pair), 1 unit otherwise.
    nonbmp = chr(0x10ffff)
    if ctypes.sizeof(ctypes.c_wchar) == 2:
        nchar = 2
    else:  # sizeof(c_wchar) == 4
        nchar = 1
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00001765
Benjamin Peterson811c2f12011-09-30 21:31:21 -04001766 def test_subclass_add(self):
1767 class S(str):
1768 def __add__(self, o):
1769 return "3"
1770 self.assertEqual(S("4") + S("5"), "3")
1771 class S(str):
1772 def __iadd__(self, o):
1773 return "3"
1774 s = S("1")
1775 s += "4"
1776 self.assertEqual(s, "3")
1777
Victor Stinner1c24bd02010-10-02 11:03:13 +00001778
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        """Check the tuples produced by ``_string.formatter_parser()``.

        As the expected values show, every item is a 4-tuple laid out as
        (literal text, field name, format spec, conversion); the last
        three are None for a trailing literal with no replacement field.
        """
        def parse(format_string):
            # Materialise the iterator so it can be compared to a list.
            return list(_string.formatter_parser(format_string))

        # Positional, attribute and index fields with specs/conversions.
        parsed = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
        self.assertEqual(parsed, [
            ('prefix ', '2', '', 's'),
            ('xxx', '0', '^+10.3f', None),
            ('', 'obj.attr', '', 's'),
            (' ', 'z[0]', '10', 's'),
        ])

        # Auto-numbered field followed by trailing literal text.
        parsed = parse("prefix {} suffix")
        self.assertEqual(parsed, [
            ('prefix ', '', '', None),
            (' suffix', None, None, None),
        ])

        # Literal text only.
        parsed = parse("str")
        self.assertEqual(parsed, [
            ('str', None, None, None),
        ])

        # Empty format string yields nothing.
        parsed = parse("")
        self.assertEqual(parsed, [])

        # A lone replacement field has an empty literal prefix.
        parsed = parse("{0}")
        self.assertEqual(parsed, [
            ('', '0', '', None),
        ])

        # Non-string input is rejected.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        """Check ``_string.formatter_field_name_split()``.

        The function returns a pair: the leading name and an iterable of
        ``(is_attribute, name)`` tuples, where the flag is True for
        ``.attr`` access and False for ``[key]`` access.
        """
        def split(name):
            # Expand the lazily produced tail so it compares as a list.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        self.assertEqual(split("obj"), ["obj", []])
        self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
        self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
        self.assertEqual(split("obj.arg[key1][key2]"), [
            "obj",
            [(True, 'arg'),
             (False, 'key1'),
             (False, 'key2'),
             ]])
        # Non-string input is rejected.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
1828
1829
def test_main():
    """Run the test cases of this module through ``support.run_unittest``."""
    support.run_unittest(__name__)
Barry Warsaw817918c2002-08-06 16:58:21 +00001832
if __name__ == "__main__":
    # Allow running this test file directly as a script.
    test_main()