blob: dd1428710d568c5811ee6cb7454968ff891686a6 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03007"""
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
Victor Stinner22eb6892019-06-26 00:51:05 +020014import textwrap
Greg Price6bccbe72019-08-14 04:05:19 -070015import unicodedata
Guido van Rossum98297ee2007-11-06 21:34:58 +000016import unittest
17import warnings
Hai Shideb01622020-07-06 20:29:49 +080018from test.support import import_helper
19from test.support import warnings_helper
Benjamin Petersonee8712c2008-05-20 21:35:26 +000020from test import support, string_tests
Victor Stinner22eb6892019-06-26 00:51:05 +020021from test.support.script_helper import assert_python_failure
Guido van Rossuma831cac2000-03-10 23:23:21 +000022
Neal Norwitz430f68b2005-11-24 22:00:56 +000023# Error handling (bad decoder return)
24def search_function(encoding):
25 def decode1(input, errors="strict"):
26 return 42 # not a tuple
27 def encode1(input, errors="strict"):
28 return 42 # not a tuple
29 def encode2(input, errors="strict"):
30 return (42, 42) # no unicode
31 def decode2(input, errors="strict"):
32 return (42, 42) # no unicode
33 if encoding=="test.unicode1":
34 return (encode1, decode1, None, None)
35 elif encoding=="test.unicode2":
36 return (encode2, decode2, None, None)
37 else:
38 return None
Neal Norwitz430f68b2005-11-24 22:00:56 +000039
Victor Stinner9fc59812013-04-08 22:34:43 +020040def duplicate_string(text):
41 """
42 Try to get a fresh clone of the specified text:
43 new object with a reference count of 1.
44
45 This is a best-effort: latin1 single letters and the empty
46 string ('') are singletons and cannot be cloned.
47 """
48 return text.encode().decode()
49
Serhiy Storchaka15095802015-11-25 15:47:01 +020050class StrSubclass(str):
51 pass
52
Brett Cannon226b2302010-03-20 22:22:22 +000053class UnicodeTest(string_tests.CommonTest,
54 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020055 string_tests.MixinStrUnicodeTest,
56 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000057
Guido van Rossumef87d6e2007-05-02 19:09:54 +000058 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000059
Hai Shic9f696c2020-10-16 16:34:15 +080060 def setUp(self):
61 codecs.register(search_function)
62 self.addCleanup(codecs.unregister, search_function)
63
Walter Dörwald0fd583c2003-02-21 12:53:50 +000064 def checkequalnofix(self, result, object, methodname, *args):
65 method = getattr(object, methodname)
66 realresult = method(*args)
67 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000068 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000069
70 # if the original is returned make sure that
71 # this doesn't happen with subclasses
72 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000073 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000074 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000075 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000076 object = usub(object)
77 method = getattr(object, methodname)
78 realresult = method(*args)
79 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000080 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000081
Jeremy Hylton504de6b2003-10-06 05:08:26 +000082 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000083 self.assertEqual('\xff', '\u00ff')
84 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000085 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
86 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
87 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000088 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000089 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000090
Georg Brandl559e5d72008-06-11 18:37:52 +000091 def test_ascii(self):
92 if not sys.platform.startswith('java'):
93 # Test basic sanity of repr()
94 self.assertEqual(ascii('abc'), "'abc'")
95 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
96 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
97 self.assertEqual(ascii('\\c'), "'\\\\c'")
98 self.assertEqual(ascii('\\'), "'\\\\'")
99 self.assertEqual(ascii('\n'), "'\\n'")
100 self.assertEqual(ascii('\r'), "'\\r'")
101 self.assertEqual(ascii('\t'), "'\\t'")
102 self.assertEqual(ascii('\b'), "'\\x08'")
103 self.assertEqual(ascii("'\""), """'\\'"'""")
104 self.assertEqual(ascii("'\""), """'\\'"'""")
105 self.assertEqual(ascii("'"), '''"'"''')
106 self.assertEqual(ascii('"'), """'"'""")
107 latin1repr = (
108 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
109 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
110 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
111 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
112 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
113 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
114 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
115 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
116 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
117 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
118 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
119 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
120 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
121 "\\xfe\\xff'")
122 testrepr = ascii(''.join(map(chr, range(256))))
123 self.assertEqual(testrepr, latin1repr)
124 # Test ascii works on wide unicode escapes without overflow.
125 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
126 ascii("\U00010000" * 39 + "\uffff" * 4096))
127
128 class WrongRepr:
129 def __repr__(self):
130 return b'byte-repr'
131 self.assertRaises(TypeError, ascii, WrongRepr())
132
Walter Dörwald28256f22003-01-19 16:59:20 +0000133 def test_repr(self):
134 if not sys.platform.startswith('java'):
135 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000136 self.assertEqual(repr('abc'), "'abc'")
137 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
138 self.assertEqual(repr('ab\\'), "'ab\\\\'")
139 self.assertEqual(repr('\\c'), "'\\\\c'")
140 self.assertEqual(repr('\\'), "'\\\\'")
141 self.assertEqual(repr('\n'), "'\\n'")
142 self.assertEqual(repr('\r'), "'\\r'")
143 self.assertEqual(repr('\t'), "'\\t'")
144 self.assertEqual(repr('\b'), "'\\x08'")
145 self.assertEqual(repr("'\""), """'\\'"'""")
146 self.assertEqual(repr("'\""), """'\\'"'""")
147 self.assertEqual(repr("'"), '''"'"''')
148 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000149 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000150 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000151 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
152 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
153 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
154 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
155 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000156 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
157 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
158 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
159 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
160 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
161 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
162 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
163 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000164 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000165 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000166 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000167 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
168 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000169
Georg Brandl559e5d72008-06-11 18:37:52 +0000170 class WrongRepr:
171 def __repr__(self):
172 return b'byte-repr'
173 self.assertRaises(TypeError, repr, WrongRepr())
174
Guido van Rossum49d6b072006-08-17 21:11:47 +0000175 def test_iterators(self):
176 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000177 it = "\u1111\u2222\u3333".__iter__()
178 self.assertEqual(next(it), "\u1111")
179 self.assertEqual(next(it), "\u2222")
180 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000181 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000182
Walter Dörwald28256f22003-01-19 16:59:20 +0000183 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000184 string_tests.CommonTest.test_count(self)
185 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000186 self.checkequalnofix(3, 'aaa', 'count', 'a')
187 self.checkequalnofix(0, 'aaa', 'count', 'b')
188 self.checkequalnofix(3, 'aaa', 'count', 'a')
189 self.checkequalnofix(0, 'aaa', 'count', 'b')
190 self.checkequalnofix(0, 'aaa', 'count', 'b')
191 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
192 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
193 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
194 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200195 # test mixed kinds
196 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
197 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
198 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
199 self.checkequal(0, 'a' * 10, 'count', '\u0102')
200 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
201 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
202 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
203 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
204 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
205 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
206 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
207 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000208
Walter Dörwald28256f22003-01-19 16:59:20 +0000209 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200210 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200211 # test implementation details of the memchr fast path
212 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
213 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
214 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
215 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
216 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
217 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
218 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
219 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000220 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
221 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
222 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000223
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000224 self.assertRaises(TypeError, 'hello'.find)
225 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200226 # test mixed kinds
227 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
228 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
229 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
230 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
231 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
232 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
233 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
234 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
235 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
236 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
237 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
238 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000239
Walter Dörwald28256f22003-01-19 16:59:20 +0000240 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000241 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200242 # test implementation details of the memrchr fast path
243 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
244 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
245 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
246 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
247 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
248 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
249 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000250 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000251 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
252 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
253 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200254 # test mixed kinds
255 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
256 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
257 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
258 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
259 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
260 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
261 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
262 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
263 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
264 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
265 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
266 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000267
Walter Dörwald28256f22003-01-19 16:59:20 +0000268 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000269 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000270 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
271 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
272 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
273 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
274 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
275 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
276 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
277 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200278 # test mixed kinds
279 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
280 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
281 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
282 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
283 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
284 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
285 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
286 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
287 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
288 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
289 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
290 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000291
Walter Dörwald28256f22003-01-19 16:59:20 +0000292 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000293 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000294 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
295 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
296 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
297 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000298
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000299 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
300 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
301 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
302 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
303 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200304 # test mixed kinds
305 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
306 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
307 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
308 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
309 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
310 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
311 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
312 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
313 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
314 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
315 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
316 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000317
Georg Brandlceee0772007-11-27 23:48:05 +0000318 def test_maketrans_translate(self):
319 # these work with plain translate()
320 self.checkequalnofix('bbbc', 'abababc', 'translate',
321 {ord('a'): None})
322 self.checkequalnofix('iiic', 'abababc', 'translate',
323 {ord('a'): None, ord('b'): ord('i')})
324 self.checkequalnofix('iiix', 'abababc', 'translate',
325 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
326 self.checkequalnofix('c', 'abababc', 'translate',
327 {ord('a'): None, ord('b'): ''})
328 self.checkequalnofix('xyyx', 'xzx', 'translate',
329 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200330
Georg Brandlceee0772007-11-27 23:48:05 +0000331 # this needs maketrans()
332 self.checkequalnofix('abababc', 'abababc', 'translate',
333 {'b': '<i>'})
334 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
335 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
336 # test alternative way of calling maketrans()
337 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
338 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
339
Victor Stinner5a29f252014-04-05 00:17:51 +0200340 # various tests switching from ASCII to latin1 or the opposite;
341 # same length, remove a letter, or replace with a longer string.
342 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
343 "[X]")
344 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
345 "[X]")
346 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
347 "[]")
348 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
349 "[XXX]")
350 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
351 "[\xe9]")
Victor Stinner33798672016-03-01 21:59:58 +0100352 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
353 "x123")
354 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
355 "x\xe9")
356
357 # test non-ASCII (don't take the fast-path)
Victor Stinner5a29f252014-04-05 00:17:51 +0200358 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
359 "[<\xe9>]")
360 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
361 "[a]")
362 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
363 "[]")
Victor Stinner33798672016-03-01 21:59:58 +0100364 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
365 "[123]")
366 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
367 "[<\u20ac>\xe9]")
Victor Stinner5a29f252014-04-05 00:17:51 +0200368
Victor Stinner4ff33af2014-04-05 11:56:37 +0200369 # invalid Unicode characters
370 invalid_char = 0x10ffff+1
371 for before in "a\xe9\u20ac\U0010ffff":
372 mapping = str.maketrans({before: invalid_char})
373 text = "[%s]" % before
374 self.assertRaises(ValueError, text.translate, mapping)
375
376 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000377 self.assertRaises(TypeError, self.type2test.maketrans)
378 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
379 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
380 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
381 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
382 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
383 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000384
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000385 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000386 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000387
Walter Dörwald28256f22003-01-19 16:59:20 +0000388 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000389 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000390
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200391 # test mixed kinds
392 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
393 left *= 9
394 right *= 9
395 for delim in ('c', '\u0102', '\U00010302'):
396 self.checkequal([left + right],
397 left + right, 'split', delim)
398 self.checkequal([left, right],
399 left + delim + right, 'split', delim)
400 self.checkequal([left + right],
401 left + right, 'split', delim * 2)
402 self.checkequal([left, right],
403 left + delim * 2 + right, 'split', delim *2)
404
405 def test_rsplit(self):
406 string_tests.CommonTest.test_rsplit(self)
407 # test mixed kinds
408 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
409 left *= 9
410 right *= 9
411 for delim in ('c', '\u0102', '\U00010302'):
412 self.checkequal([left + right],
413 left + right, 'rsplit', delim)
414 self.checkequal([left, right],
415 left + delim + right, 'rsplit', delim)
416 self.checkequal([left + right],
417 left + right, 'rsplit', delim * 2)
418 self.checkequal([left, right],
419 left + delim * 2 + right, 'rsplit', delim *2)
420
421 def test_partition(self):
422 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
423 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300424 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200425 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
426 left *= 9
427 right *= 9
428 for delim in ('c', '\u0102', '\U00010302'):
429 self.checkequal((left + right, '', ''),
430 left + right, 'partition', delim)
431 self.checkequal((left, delim, right),
432 left + delim + right, 'partition', delim)
433 self.checkequal((left + right, '', ''),
434 left + right, 'partition', delim * 2)
435 self.checkequal((left, delim * 2, right),
436 left + delim * 2 + right, 'partition', delim * 2)
437
438 def test_rpartition(self):
439 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
440 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300441 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200442 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
443 left *= 9
444 right *= 9
445 for delim in ('c', '\u0102', '\U00010302'):
446 self.checkequal(('', '', left + right),
447 left + right, 'rpartition', delim)
448 self.checkequal((left, delim, right),
449 left + delim + right, 'rpartition', delim)
450 self.checkequal(('', '', left + right),
451 left + right, 'rpartition', delim * 2)
452 self.checkequal((left, delim * 2, right),
453 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000454
Walter Dörwald28256f22003-01-19 16:59:20 +0000455 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000456 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000457
Guido van Rossumf1044292007-09-27 18:01:22 +0000458 class MyWrapper:
459 def __init__(self, sval): self.sval = sval
460 def __str__(self): return self.sval
461
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000462 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000463 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
464 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
465 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
466 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
467 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
468 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
469 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000470 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
471 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
472 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
473 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000474
Martin Panterb71c0952017-01-12 11:54:59 +0000475 @unittest.skipIf(sys.maxsize > 2**32,
476 'needs too much memory on a 64-bit platform')
477 def test_join_overflow(self):
478 size = int(sys.maxsize**0.5) + 1
479 seq = ('A' * size,) * size
480 self.assertRaises(OverflowError, ''.join, seq)
481
Walter Dörwald28256f22003-01-19 16:59:20 +0000482 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000483 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000484
Walter Dörwald28256f22003-01-19 16:59:20 +0000485 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000486 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
487 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200488 # test mixed kinds
489 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
490 left *= 9
491 right *= 9
492 for delim in ('c', '\u0102', '\U00010302'):
493 for repl in ('d', '\u0103', '\U00010303'):
494 self.checkequal(left + right,
495 left + right, 'replace', delim, repl)
496 self.checkequal(left + repl + right,
497 left + delim + right,
498 'replace', delim, repl)
499 self.checkequal(left + right,
500 left + right, 'replace', delim * 2, repl)
501 self.checkequal(left + repl + right,
502 left + delim * 2 + right,
503 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000504
Victor Stinner59de0ee2011-10-07 10:01:28 +0200505 @support.cpython_only
506 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200507 pattern = 'abc'
508 text = 'abc def'
509 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200510
Miss Islington (bot)45a97d92021-08-13 04:04:08 -0700511 def test_repeat_id_preserving(self):
512 a = '123abc1@'
513 b = '456zyx-+'
514 self.assertEqual(id(a), id(a))
515 self.assertNotEqual(id(a), id(b))
516 self.assertNotEqual(id(a), id(a * -4))
517 self.assertNotEqual(id(a), id(a * 0))
518 self.assertEqual(id(a), id(a * 1))
519 self.assertEqual(id(a), id(1 * a))
520 self.assertNotEqual(id(a), id(a * 2))
521
522 class SubStr(str):
523 pass
524
525 s = SubStr('qwerty()')
526 self.assertEqual(id(s), id(s))
527 self.assertNotEqual(id(s), id(s * -4))
528 self.assertNotEqual(id(s), id(s * 0))
529 self.assertNotEqual(id(s), id(s * 1))
530 self.assertNotEqual(id(s), id(1 * s))
531 self.assertNotEqual(id(s), id(s * 2))
532
Guido van Rossum98297ee2007-11-06 21:34:58 +0000533 def test_bytes_comparison(self):
Hai Shideb01622020-07-06 20:29:49 +0800534 with warnings_helper.check_warnings():
Brett Cannon226b2302010-03-20 22:22:22 +0000535 warnings.simplefilter('ignore', BytesWarning)
536 self.assertEqual('abc' == b'abc', False)
537 self.assertEqual('abc' != b'abc', True)
538 self.assertEqual('abc' == bytearray(b'abc'), False)
539 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000540
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 def test_comparison(self):
542 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000544 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000545 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000546
547 if 0:
548 # Move these tests to a Unicode collation module test...
549 # Testing UTF-16 code point order comparisons...
550
551 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000552 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000553 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000554 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000555
556 # Non surrogate above surrogate value, fixup required
557 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000558 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000559
560 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000561 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000562 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000563 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000564 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000565 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000566 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000567 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000568 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000569 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000570 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000571 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000572 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000573 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000574 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000575 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000576 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000577 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000578 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000579 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000580 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000581 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000582 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000583 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000584 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000585 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000586 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000587 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000588 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000589 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000590 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000591 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000592 test_lecmp(s, s2)
593
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000594 test_fixup('\ue000')
595 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000596
597 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000598 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000599
Walter Dörwald28256f22003-01-19 16:59:20 +0000600 def test_islower(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000601 super().test_islower()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000602 self.checkequalnofix(False, '\u1FFc', 'islower')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500603 self.assertFalse('\u2167'.islower())
604 self.assertTrue('\u2177'.islower())
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300605 # non-BMP, uppercase
606 self.assertFalse('\U00010401'.islower())
607 self.assertFalse('\U00010427'.islower())
608 # non-BMP, lowercase
609 self.assertTrue('\U00010429'.islower())
610 self.assertTrue('\U0001044E'.islower())
611 # non-BMP, non-cased
612 self.assertFalse('\U0001F40D'.islower())
613 self.assertFalse('\U0001F46F'.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000614
615 def test_isupper(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000616 super().test_isupper()
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000617 if not sys.platform.startswith('java'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000618 self.checkequalnofix(False, '\u1FFc', 'isupper')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500619 self.assertTrue('\u2167'.isupper())
620 self.assertFalse('\u2177'.isupper())
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300621 # non-BMP, uppercase
622 self.assertTrue('\U00010401'.isupper())
623 self.assertTrue('\U00010427'.isupper())
624 # non-BMP, lowercase
625 self.assertFalse('\U00010429'.isupper())
626 self.assertFalse('\U0001044E'.isupper())
627 # non-BMP, non-cased
628 self.assertFalse('\U0001F40D'.isupper())
629 self.assertFalse('\U0001F46F'.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000630
631 def test_istitle(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000632 super().test_istitle()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000633 self.checkequalnofix(True, '\u1FFc', 'istitle')
634 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000635
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300636 # non-BMP, uppercase + lowercase
637 self.assertTrue('\U00010401\U00010429'.istitle())
638 self.assertTrue('\U00010427\U0001044E'.istitle())
639 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
640 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
641 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
642
Walter Dörwald28256f22003-01-19 16:59:20 +0000643 def test_isspace(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000644 super().test_isspace()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000645 self.checkequalnofix(True, '\u2000', 'isspace')
646 self.checkequalnofix(True, '\u200a', 'isspace')
647 self.checkequalnofix(False, '\u2014', 'isspace')
Greg Price6bccbe72019-08-14 04:05:19 -0700648 # There are no non-BMP whitespace chars as of Unicode 12.
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300649 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
650 '\U0001F40D', '\U0001F46F']:
651 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
652
Greg Price6bccbe72019-08-14 04:05:19 -0700653 @support.requires_resource('cpu')
654 def test_isspace_invariant(self):
655 for codepoint in range(sys.maxunicode + 1):
656 char = chr(codepoint)
657 bidirectional = unicodedata.bidirectional(char)
658 category = unicodedata.category(char)
659 self.assertEqual(char.isspace(),
660 (bidirectional in ('WS', 'B', 'S')
661 or category == 'Zs'))
662
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300663 def test_isalnum(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000664 super().test_isalnum()
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300665 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
666 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
667 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000668
669 def test_isalpha(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000670 super().test_isalpha()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000671 self.checkequalnofix(True, '\u1FFc', 'isalpha')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300672 # non-BMP, cased
673 self.assertTrue('\U00010401'.isalpha())
674 self.assertTrue('\U00010427'.isalpha())
675 self.assertTrue('\U00010429'.isalpha())
676 self.assertTrue('\U0001044E'.isalpha())
677 # non-BMP, non-cased
678 self.assertFalse('\U0001F40D'.isalpha())
679 self.assertFalse('\U0001F46F'.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000680
INADA Naokia49ac992018-01-27 14:06:21 +0900681 def test_isascii(self):
682 super().test_isascii()
683 self.assertFalse("\u20ac".isascii())
684 self.assertFalse("\U0010ffff".isascii())
685
Walter Dörwald28256f22003-01-19 16:59:20 +0000686 def test_isdecimal(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000687 self.checkequalnofix(False, '', 'isdecimal')
688 self.checkequalnofix(False, 'a', 'isdecimal')
689 self.checkequalnofix(True, '0', 'isdecimal')
690 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
691 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
692 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
693 self.checkequalnofix(True, '0123456789', 'isdecimal')
694 self.checkequalnofix(False, '0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000695
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000696 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000697
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300698 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
699 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
700 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
701 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
702 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
703
Walter Dörwald28256f22003-01-19 16:59:20 +0000704 def test_isdigit(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000705 super().test_isdigit()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000706 self.checkequalnofix(True, '\u2460', 'isdigit')
707 self.checkequalnofix(False, '\xbc', 'isdigit')
708 self.checkequalnofix(True, '\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000709
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300710 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
711 '\U0001F40D', '\U0001F46F', '\U00011065']:
712 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
713 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
714 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
715
Walter Dörwald28256f22003-01-19 16:59:20 +0000716 def test_isnumeric(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000717 self.checkequalnofix(False, '', 'isnumeric')
718 self.checkequalnofix(False, 'a', 'isnumeric')
719 self.checkequalnofix(True, '0', 'isnumeric')
720 self.checkequalnofix(True, '\u2460', 'isnumeric')
721 self.checkequalnofix(True, '\xbc', 'isnumeric')
722 self.checkequalnofix(True, '\u0660', 'isnumeric')
723 self.checkequalnofix(True, '0123456789', 'isnumeric')
724 self.checkequalnofix(False, '0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000725
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000726 self.assertRaises(TypeError, "abc".isnumeric, 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000727
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300728 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
729 '\U0001F40D', '\U0001F46F']:
730 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
731 for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
732 '\U000104A0', '\U0001F107']:
733 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
734
Martin v. Löwis47383402007-08-15 07:32:56 +0000735 def test_isidentifier(self):
736 self.assertTrue("a".isidentifier())
737 self.assertTrue("Z".isidentifier())
738 self.assertTrue("_".isidentifier())
739 self.assertTrue("b0".isidentifier())
740 self.assertTrue("bc".isidentifier())
741 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000742 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500743 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000744
745 self.assertFalse(" ".isidentifier())
746 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000747 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000748 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000749
Serhiy Storchaka5650e762020-05-12 16:18:00 +0300750 @support.cpython_only
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +0300751 @support.requires_legacy_unicode_capi
Serhiy Storchaka5650e762020-05-12 16:18:00 +0300752 def test_isidentifier_legacy(self):
753 import _testcapi
754 u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
755 self.assertTrue(u.isidentifier())
Hai Shideb01622020-07-06 20:29:49 +0800756 with warnings_helper.check_warnings():
Inada Naoki038dd0f2020-06-30 15:26:56 +0900757 warnings.simplefilter('ignore', DeprecationWarning)
758 self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
Serhiy Storchaka5650e762020-05-12 16:18:00 +0300759
Georg Brandl559e5d72008-06-11 18:37:52 +0000760 def test_isprintable(self):
761 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000762 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000763 self.assertTrue("abcdefg".isprintable())
764 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000765 # some defined Unicode character
766 self.assertTrue("\u0374".isprintable())
767 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000768 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000769 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000770 self.assertFalse("\ud800".isprintable())
771
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300772 self.assertTrue('\U0001F46F'.isprintable())
773 self.assertFalse('\U000E0020'.isprintable())
774
775 def test_surrogates(self):
776 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
777 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
778 self.assertTrue(s.islower())
779 self.assertFalse(s.isupper())
780 self.assertFalse(s.istitle())
781 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
782 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
783 self.assertFalse(s.islower())
784 self.assertTrue(s.isupper())
785 self.assertTrue(s.istitle())
786
787 for meth_name in ('islower', 'isupper', 'istitle'):
788 meth = getattr(str, meth_name)
789 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
790 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
791
792 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
793 'isdecimal', 'isnumeric',
794 'isidentifier', 'isprintable'):
795 meth = getattr(str, meth_name)
796 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
797 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
798 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
799 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
800
801
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300802 def test_lower(self):
803 string_tests.CommonTest.test_lower(self)
804 self.assertEqual('\U00010427'.lower(), '\U0001044F')
805 self.assertEqual('\U00010427\U00010427'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300806 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300807 self.assertEqual('\U00010427\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300808 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300809 self.assertEqual('X\U00010427x\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300810 'x\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500811 self.assertEqual('fi'.lower(), 'fi')
812 self.assertEqual('\u0130'.lower(), '\u0069\u0307')
813 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
814 self.assertEqual('\u03a3'.lower(), '\u03c3')
815 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
816 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
817 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
818 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
819 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
820 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
821 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
822 self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300823
Benjamin Petersond5890c82012-01-14 13:23:30 -0500824 def test_casefold(self):
825 self.assertEqual('hello'.casefold(), 'hello')
826 self.assertEqual('hELlo'.casefold(), 'hello')
827 self.assertEqual('ß'.casefold(), 'ss')
828 self.assertEqual('fi'.casefold(), 'fi')
829 self.assertEqual('\u03a3'.casefold(), '\u03c3')
830 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700831 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500832
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300833 def test_upper(self):
834 string_tests.CommonTest.test_upper(self)
835 self.assertEqual('\U0001044F'.upper(), '\U00010427')
836 self.assertEqual('\U0001044F\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300837 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300838 self.assertEqual('\U00010427\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300839 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300840 self.assertEqual('X\U00010427x\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300841 'X\U00010427X\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500842 self.assertEqual('fi'.upper(), 'FI')
843 self.assertEqual('\u0130'.upper(), '\u0130')
844 self.assertEqual('\u03a3'.upper(), '\u03a3')
845 self.assertEqual('ß'.upper(), 'SS')
846 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
847 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
848 self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300849
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300850 def test_capitalize(self):
851 string_tests.CommonTest.test_capitalize(self)
852 self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
853 self.assertEqual('\U0001044F\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300854 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300855 self.assertEqual('\U00010427\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300856 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300857 self.assertEqual('\U0001044F\U00010427'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300858 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300859 self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300860 'X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500861 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
862 exp = '\u0399\u0308\u0300\u0069\u0307'
863 self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
Kingsley Mb015fc82019-04-12 16:35:39 +0100864 self.assertEqual('finnish'.capitalize(), 'Finnish')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500865 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300866
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300867 def test_title(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000868 super().test_title()
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300869 self.assertEqual('\U0001044F'.title(), '\U00010427')
870 self.assertEqual('\U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300871 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300872 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300873 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300874 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300875 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300876 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300877 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300878 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300879 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500880 self.assertEqual('fiNNISH'.title(), 'Finnish')
881 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
882 self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300883
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300884 def test_swapcase(self):
885 string_tests.CommonTest.test_swapcase(self)
886 self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
887 self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
888 self.assertEqual('\U0001044F\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300889 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300890 self.assertEqual('\U00010427\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300891 '\U0001044F\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300892 self.assertEqual('\U0001044F\U00010427'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300893 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300894 self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300895 'x\U0001044FX\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500896 self.assertEqual('fi'.swapcase(), 'FI')
897 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
898 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
899 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
900 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
901 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
902 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
903 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
904 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
905 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
906 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
907 self.assertEqual('ß'.swapcase(), 'SS')
908 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300909
Ezio Melottif84e01d2013-07-08 17:48:29 +0200910 def test_center(self):
911 string_tests.CommonTest.test_center(self)
912 self.assertEqual('x'.center(2, '\U0010FFFF'),
913 'x\U0010FFFF')
914 self.assertEqual('x'.center(3, '\U0010FFFF'),
915 '\U0010FFFFx\U0010FFFF')
916 self.assertEqual('x'.center(4, '\U0010FFFF'),
917 '\U0010FFFFx\U0010FFFF\U0010FFFF')
918
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400919 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
Benjamin Peterson4d856892014-10-15 13:39:46 -0400920 @support.cpython_only
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400921 def test_case_operation_overflow(self):
922 # Issue #22643
Serhiy Storchaka411dfd82015-11-07 16:54:48 +0200923 size = 2**32//12 + 1
924 try:
925 s = "ü" * size
926 except MemoryError:
927 self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
928 try:
929 self.assertRaises(OverflowError, s.upper)
930 finally:
931 del s
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400932
Walter Dörwald28256f22003-01-19 16:59:20 +0000933 def test_contains(self):
934 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000935 self.assertIn('a', 'abdb')
936 self.assertIn('a', 'bdab')
937 self.assertIn('a', 'bdaba')
938 self.assertIn('a', 'bdba')
939 self.assertNotIn('a', 'bdb')
940 self.assertIn('a', 'bdba')
941 self.assertIn('a', ('a',1,None))
942 self.assertIn('a', (1,None,'a'))
943 self.assertIn('a', ('a',1,None))
944 self.assertIn('a', (1,None,'a'))
945 self.assertNotIn('a', ('x',1,'y'))
946 self.assertNotIn('a', ('x',1,None))
947 self.assertNotIn('abcd', 'abcxxxx')
948 self.assertIn('ab', 'abcd')
949 self.assertIn('ab', 'abc')
950 self.assertIn('ab', (1,None,'ab'))
951 self.assertIn('', 'abc')
952 self.assertIn('', '')
953 self.assertIn('', 'abc')
954 self.assertNotIn('\0', 'abc')
955 self.assertIn('\0', '\0abc')
956 self.assertIn('\0', 'abc\0')
957 self.assertIn('a', '\0abc')
958 self.assertIn('asdf', 'asdf')
959 self.assertNotIn('asdf', 'asd')
960 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000961
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000962 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200963 # test mixed kinds
964 for fill in ('a', '\u0100', '\U00010300'):
965 fill *= 9
966 for delim in ('c', '\u0102', '\U00010302'):
967 self.assertNotIn(delim, fill)
968 self.assertIn(delim, fill + delim)
969 self.assertNotIn(delim * 2, fill)
970 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000971
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300972 def test_issue18183(self):
973 '\U00010000\U00100000'.lower()
974 '\U00010000\U00100000'.casefold()
975 '\U00010000\U00100000'.upper()
976 '\U00010000\U00100000'.capitalize()
977 '\U00010000\U00100000'.title()
978 '\U00010000\U00100000'.swapcase()
979 '\U00100000'.center(3, '\U00010000')
980 '\U00100000'.ljust(3, '\U00010000')
981 '\U00100000'.rjust(3, '\U00010000')
982
Eric Smith8c663262007-08-25 02:26:07 +0000983 def test_format(self):
984 self.assertEqual(''.format(), '')
985 self.assertEqual('a'.format(), 'a')
986 self.assertEqual('ab'.format(), 'ab')
987 self.assertEqual('a{{'.format(), 'a{')
988 self.assertEqual('a}}'.format(), 'a}')
989 self.assertEqual('{{b'.format(), '{b')
990 self.assertEqual('}}b'.format(), '}b')
991 self.assertEqual('a{{b'.format(), 'a{b')
992
993 # examples from the PEP:
994 import datetime
995 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
996 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
997 "My name is Fred")
998 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
999 "My name is Fred :-{}")
1000
1001 d = datetime.date(2007, 8, 18)
1002 self.assertEqual("The year is {0.year}".format(d),
1003 "The year is 2007")
1004
Eric Smith8c663262007-08-25 02:26:07 +00001005 # classes we'll use for testing
1006 class C:
1007 def __init__(self, x=100):
1008 self._x = x
1009 def __format__(self, spec):
1010 return spec
1011
1012 class D:
1013 def __init__(self, x):
1014 self.x = x
1015 def __format__(self, spec):
1016 return str(self.x)
1017
1018 # class with __str__, but no __format__
1019 class E:
1020 def __init__(self, x):
1021 self.x = x
1022 def __str__(self):
1023 return 'E(' + self.x + ')'
1024
1025 # class with __repr__, but no __format__ or __str__
1026 class F:
1027 def __init__(self, x):
1028 self.x = x
1029 def __repr__(self):
1030 return 'F(' + self.x + ')'
1031
1032 # class with __format__ that forwards to string, for some format_spec's
1033 class G:
1034 def __init__(self, x):
1035 self.x = x
1036 def __str__(self):
1037 return "string is " + self.x
1038 def __format__(self, format_spec):
1039 if format_spec == 'd':
1040 return 'G(' + self.x + ')'
1041 return object.__format__(self, format_spec)
1042
Eric Smith739e2ad2007-08-27 19:07:22 +00001043 class I(datetime.date):
1044 def __format__(self, format_spec):
1045 return self.strftime(format_spec)
1046
Eric Smith185e30c2007-08-30 22:23:08 +00001047 class J(int):
1048 def __format__(self, format_spec):
1049 return int.__format__(self * 2, format_spec)
1050
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001051 class M:
1052 def __init__(self, x):
1053 self.x = x
1054 def __repr__(self):
1055 return 'M(' + self.x + ')'
1056 __str__ = None
1057
1058 class N:
1059 def __init__(self, x):
1060 self.x = x
1061 def __repr__(self):
1062 return 'N(' + self.x + ')'
1063 __format__ = None
Eric Smith8c663262007-08-25 02:26:07 +00001064
1065 self.assertEqual(''.format(), '')
1066 self.assertEqual('abc'.format(), 'abc')
1067 self.assertEqual('{0}'.format('abc'), 'abc')
1068 self.assertEqual('{0:}'.format('abc'), 'abc')
1069# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1070 self.assertEqual('X{0}'.format('abc'), 'Xabc')
1071 self.assertEqual('{0}X'.format('abc'), 'abcX')
1072 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1073 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1074 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1075 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1076 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1077 self.assertEqual('{0}'.format(-15), '-15')
1078 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1079 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1080 self.assertEqual('{{'.format(), '{')
1081 self.assertEqual('}}'.format(), '}')
1082 self.assertEqual('{{}}'.format(), '{}')
1083 self.assertEqual('{{x}}'.format(), '{x}')
1084 self.assertEqual('{{{0}}}'.format(123), '{123}')
1085 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1086 self.assertEqual('}}{{'.format(), '}{')
1087 self.assertEqual('}}x{{'.format(), '}x{')
1088
Eric Smith7ade6482007-08-26 22:27:13 +00001089 # weird field names
1090 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1091 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001092 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001093
Eric Smith8c663262007-08-25 02:26:07 +00001094 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1095 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1096 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1097 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1098 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1099 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1100 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1101
Eric Smith8c663262007-08-25 02:26:07 +00001102 # strings
1103 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1104 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1105 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1106 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1107 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1108 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1109 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1110 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1111 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1112 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1113 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1114 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1115 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1116 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1117 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1118 self.assertEqual('{0:>7s}'.format('result'), ' result')
1119 self.assertEqual('{0:>8s}'.format('result'), ' result')
1120 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1121 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1122 self.assertEqual('{0:^10s}'.format('result'), ' result ')
Serhiy Storchakacf19cc32021-01-25 11:56:33 +02001123 self.assertEqual('{0:8s}'.format('result'), 'result ')
1124 self.assertEqual('{0:0s}'.format('result'), 'result')
1125 self.assertEqual('{0:08s}'.format('result'), 'result00')
1126 self.assertEqual('{0:<08s}'.format('result'), 'result00')
1127 self.assertEqual('{0:>08s}'.format('result'), '00result')
1128 self.assertEqual('{0:^08s}'.format('result'), '0result0')
Eric Smith8c663262007-08-25 02:26:07 +00001129 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1130 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1131 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1132
Eric V. Smith2ea97122014-04-14 11:55:10 -04001133 # issue 12546: use \x00 as a fill character
1134 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1135 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1136 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1137 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1138
1139 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1140 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1141 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1142 self.assertEqual('{0:<6}'.format(3), '3 ')
1143
1144 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1145 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1146 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1147 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1148
1149 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1150 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1151 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1152 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1153
Eric Smith8c663262007-08-25 02:26:07 +00001154 # format specifiers for user defined type
1155 self.assertEqual('{0:abc}'.format(C()), 'abc')
1156
Georg Brandld52429f2008-07-04 15:55:02 +00001157 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001158 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1159 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1160 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1161 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1162 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1163 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1164 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001165 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001166 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1167 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001168 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001169 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001170 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001171 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1172 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001173 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001174
Eric Smith8c663262007-08-25 02:26:07 +00001175 # test fallback to object.__format__
1176 self.assertEqual('{0}'.format({}), '{}')
1177 self.assertEqual('{0}'.format([]), '[]')
1178 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001179
Eric Smith8c663262007-08-25 02:26:07 +00001180 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001181 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1182
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001183 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1184 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1185 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001186
Eric Smith739e2ad2007-08-27 19:07:22 +00001187 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1188 month=8,
1189 day=27)),
1190 "date: 2007-08-27")
1191
Eric Smith185e30c2007-08-30 22:23:08 +00001192 # test deriving from a builtin type and overriding __format__
1193 self.assertEqual("{0}".format(J(10)), "20")
1194
1195
Eric Smith8c663262007-08-25 02:26:07 +00001196 # string format specifiers
1197 self.assertEqual('{0:}'.format('a'), 'a')
1198
1199 # computed format specifiers
1200 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1201 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1202 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1203 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1204 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1205
1206 # test various errors
1207 self.assertRaises(ValueError, '{'.format)
1208 self.assertRaises(ValueError, '}'.format)
1209 self.assertRaises(ValueError, 'a{'.format)
1210 self.assertRaises(ValueError, 'a}'.format)
1211 self.assertRaises(ValueError, '{a'.format)
1212 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001213 self.assertRaises(IndexError, '{0}'.format)
1214 self.assertRaises(IndexError, '{1}'.format, 'abc')
1215 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001216 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001217 self.assertRaises(ValueError, "abc{0:{}".format)
1218 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001219 self.assertRaises(IndexError, "{0.}".format)
1220 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001221 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001222 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001223 self.assertRaises(KeyError, "{0]}".format)
1224 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001225 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001226 self.assertRaises(ValueError, "{0[0}".format, 0)
1227 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1228 self.assertRaises(KeyError, "{c]}".format)
1229 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1230 self.assertRaises(ValueError, "{0}}".format, 0)
1231 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001232 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001233 self.assertRaises(ValueError, "{0!}".format, 0)
1234 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001235 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001236 self.assertRaises(IndexError, "{:}".format)
1237 self.assertRaises(IndexError, "{:s}".format)
1238 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001239 big = "23098475029384702983476098230754973209482573"
1240 self.assertRaises(ValueError, ("{" + big + "}").format)
1241 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001242
Eric Smith41669ca2009-05-23 14:23:22 +00001243 # issue 6089
1244 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1245 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1246
Eric Smith8c663262007-08-25 02:26:07 +00001247 # can't have a replacement on the field name portion
1248 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1249
1250 # exceed maximum recursion depth
1251 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1252 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1253 0, 1, 2, 3, 4, 5, 6, 7)
1254
1255 # string format spec errors
Miss Islington (bot)2d780232021-05-13 14:24:49 -07001256 sign_msg = "Sign not allowed in string format specifier"
1257 self.assertRaisesRegex(ValueError, sign_msg, "{0:-s}".format, '')
1258 self.assertRaisesRegex(ValueError, sign_msg, format, "", "-")
1259 space_msg = "Space not allowed in string format specifier"
1260 self.assertRaisesRegex(ValueError, space_msg, "{: }".format, '')
Eric Smith8c663262007-08-25 02:26:07 +00001261 self.assertRaises(ValueError, "{0:=s}".format, '')
1262
Eric Smithb1ebcc62008-07-15 13:02:41 +00001263 # Alternate formatting is not supported
1264 self.assertRaises(ValueError, format, '', '#')
1265 self.assertRaises(ValueError, format, '', '#20')
1266
Victor Stinnerece58de2012-04-23 23:36:38 +02001267 # Non-ASCII
1268 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1269 'ABC\u0410\u0411\u0412')
1270 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1271 'ABC')
1272 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1273 '')
1274
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001275 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001276 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1277 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1278 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1279 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1280 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1281 self.assertRaises(ValueError, "{a{}b}".format, 42)
1282 self.assertRaises(ValueError, "{a{b}".format, 42)
1283 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001284
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001285 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001286
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001287 # Blocking fallback
1288 m = M('data')
1289 self.assertEqual("{!r}".format(m), 'M(data)')
1290 self.assertRaises(TypeError, "{!s}".format, m)
1291 self.assertRaises(TypeError, "{}".format, m)
1292 n = N('data')
1293 self.assertEqual("{!r}".format(n), 'N(data)')
1294 self.assertEqual("{!s}".format(n), 'N(data)')
1295 self.assertRaises(TypeError, "{}".format, n)
1296
Eric Smith27bbca62010-11-04 17:06:58 +00001297 def test_format_map(self):
1298 self.assertEqual(''.format_map({}), '')
1299 self.assertEqual('a'.format_map({}), 'a')
1300 self.assertEqual('ab'.format_map({}), 'ab')
1301 self.assertEqual('a{{'.format_map({}), 'a{')
1302 self.assertEqual('a}}'.format_map({}), 'a}')
1303 self.assertEqual('{{b'.format_map({}), '{b')
1304 self.assertEqual('}}b'.format_map({}), '}b')
1305 self.assertEqual('a{{b'.format_map({}), 'a{b')
1306
1307 # using mappings
1308 class Mapping(dict):
1309 def __missing__(self, key):
1310 return key
1311 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1312 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1313
1314 class InternalMapping:
1315 def __init__(self):
1316 self.mapping = {'a': 'hello'}
1317 def __getitem__(self, key):
1318 return self.mapping[key]
1319 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1320
1321
Eric Smith27bbca62010-11-04 17:06:58 +00001322 class C:
1323 def __init__(self, x=100):
1324 self._x = x
1325 def __format__(self, spec):
1326 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001327 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1328
1329 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001330 self.assertRaises(TypeError, ''.format_map)
1331 self.assertRaises(TypeError, 'a'.format_map)
1332
1333 self.assertRaises(ValueError, '{'.format_map, {})
1334 self.assertRaises(ValueError, '}'.format_map, {})
1335 self.assertRaises(ValueError, 'a{'.format_map, {})
1336 self.assertRaises(ValueError, 'a}'.format_map, {})
1337 self.assertRaises(ValueError, '{a'.format_map, {})
1338 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001339
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001340 # issue #12579: can't supply positional params to format_map
1341 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1342 self.assertRaises(ValueError, '{}'.format_map, 'a')
1343 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1344
Serhiy Storchaka50754162017-08-03 11:45:23 +03001345 class BadMapping:
1346 def __getitem__(self, key):
1347 return 1/0
1348 self.assertRaises(KeyError, '{a}'.format_map, {})
1349 self.assertRaises(TypeError, '{a}'.format_map, [])
1350 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1351
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001352 def test_format_huge_precision(self):
1353 format_string = ".{}f".format(sys.maxsize + 1)
1354 with self.assertRaises(ValueError):
1355 result = format(2.34, format_string)
1356
1357 def test_format_huge_width(self):
1358 format_string = "{}f".format(sys.maxsize + 1)
1359 with self.assertRaises(ValueError):
1360 result = format(2.34, format_string)
1361
1362 def test_format_huge_item_number(self):
1363 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1364 with self.assertRaises(ValueError):
1365 result = format_string.format(2.34)
1366
Eric Smith8ec90442009-03-14 12:29:34 +00001367 def test_format_auto_numbering(self):
1368 class C:
1369 def __init__(self, x=100):
1370 self._x = x
1371 def __format__(self, spec):
1372 return spec
1373
1374 self.assertEqual('{}'.format(10), '10')
1375 self.assertEqual('{:5}'.format('s'), 's ')
1376 self.assertEqual('{!r}'.format('s'), "'s'")
1377 self.assertEqual('{._x}'.format(C(10)), '10')
1378 self.assertEqual('{[1]}'.format([1, 2]), '2')
1379 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1380 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1381
1382 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1383 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1384
1385 # can't mix and match numbering and auto-numbering
1386 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1387 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1388 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1389 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1390
1391 # can mix and match auto-numbering and named
1392 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1393 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1394 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1395 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1396
Walter Dörwald28256f22003-01-19 16:59:20 +00001397 def test_formatting(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +00001398 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
Walter Dörwald28256f22003-01-19 16:59:20 +00001399 # Testing Unicode formatting strings...
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001400 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1401 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
1402 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
1403 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
1404 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
1405 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
Walter Dörwald28256f22003-01-19 16:59:20 +00001406 if not sys.platform.startswith('java'):
Walter Dörwald67e83882007-05-05 12:26:27 +00001407 self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
Georg Brandl559e5d72008-06-11 18:37:52 +00001408 self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1409 self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001410 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1411 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
Walter Dörwald56fbcb52003-03-31 18:18:41 +00001412
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001413 self.assertEqual('%c' % 0x1234, '\u1234')
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00001414 self.assertEqual('%c' % 0x21483, '\U00021483')
1415 self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1416 self.assertEqual('%c' % '\U00021483', '\U00021483')
1417 self.assertRaises(TypeError, "%c".__mod__, "aa")
Stefan Krah99212f62010-07-19 17:58:26 +00001418 self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07001419 self.assertRaises(TypeError, "%i".__mod__, "aa")
Walter Dörwald28256f22003-01-19 16:59:20 +00001420
1421 # formatting jobs delegated from the string implementation:
Walter Dörwald28256f22003-01-19 16:59:20 +00001422 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001423 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1424 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1425 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1426 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1427 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1428 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1429 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1430 self.assertEqual('...%s...' % "abc", '...abc...')
1431 self.assertEqual('%*s' % (5,'abc',), ' abc')
1432 self.assertEqual('%*s' % (-5,'abc',), 'abc ')
1433 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
1434 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
1435 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
1436 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
1437 self.assertEqual('%c' % 'a', 'a')
Neil Schemenauercf52c072005-08-12 17:34:58 +00001438 class Wrapper:
1439 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001440 return '\u1234'
1441 self.assertEqual('%s' % Wrapper(), '\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +00001442
Eric Smith741191f2009-05-06 13:08:15 +00001443 # issue 3382
1444 NAN = float('nan')
1445 INF = float('inf')
1446 self.assertEqual('%f' % NAN, 'nan')
1447 self.assertEqual('%F' % NAN, 'NAN')
1448 self.assertEqual('%f' % INF, 'inf')
1449 self.assertEqual('%F' % INF, 'INF')
1450
Victor Stinnerf59c28c2012-05-09 03:24:14 +02001451 # PEP 393
1452 self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1453 self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1454
Ethan Furmandf3ed242014-01-05 06:50:30 -08001455 #issue 19995
Ethan Furman9ab74802014-03-21 06:38:46 -07001456 class PseudoInt:
Ethan Furmandf3ed242014-01-05 06:50:30 -08001457 def __init__(self, value):
1458 self.value = int(value)
1459 def __int__(self):
1460 return self.value
1461 def __index__(self):
1462 return self.value
Ethan Furman9ab74802014-03-21 06:38:46 -07001463 class PseudoFloat:
Ethan Furmandf3ed242014-01-05 06:50:30 -08001464 def __init__(self, value):
1465 self.value = float(value)
1466 def __int__(self):
1467 return int(self.value)
Ethan Furman9ab74802014-03-21 06:38:46 -07001468 pi = PseudoFloat(3.1415)
1469 letter_m = PseudoInt(109)
Antoine Pitroueb168042014-01-10 00:02:38 +01001470 self.assertEqual('%x' % 42, '2a')
1471 self.assertEqual('%X' % 15, 'F')
1472 self.assertEqual('%o' % 9, '11')
1473 self.assertEqual('%c' % 109, 'm')
1474 self.assertEqual('%x' % letter_m, '6d')
1475 self.assertEqual('%X' % letter_m, '6D')
1476 self.assertEqual('%o' % letter_m, '155')
1477 self.assertEqual('%c' % letter_m, 'm')
Ethan Furman9ab74802014-03-21 06:38:46 -07001478 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1479 self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1480 self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1481 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1482 self.assertRaises(TypeError, operator.mod, '%c', pi),
Ethan Furmandf3ed242014-01-05 06:50:30 -08001483
Ethan Furmanfb137212013-08-31 10:18:55 -07001484 def test_formatting_with_enum(self):
1485 # issue18780
1486 import enum
1487 class Float(float, enum.Enum):
1488 PI = 3.1415926
1489 class Int(enum.IntEnum):
1490 IDES = 15
Ethan Furman9bf7c2d2021-07-03 21:08:42 -07001491 class Str(str, enum.Enum):
Ethan Furmanfb137212013-08-31 10:18:55 -07001492 ABC = 'abc'
1493 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001494 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
Ethan Furman9bf7c2d2021-07-03 21:08:42 -07001495 'Str.ABC, Str.ABC')
Ethan Furman13bdfa72013-08-31 12:48:51 -07001496 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1497 (Str.ABC, Str.ABC,
1498 Int.IDES, Int.IDES, Int.IDES,
1499 Float.PI, Float.PI),
Ethan Furman9bf7c2d2021-07-03 21:08:42 -07001500 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001501
1502 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001503 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
Ethan Furman9bf7c2d2021-07-03 21:08:42 -07001504 '...Str.ABC...')
Ethan Furman13bdfa72013-08-31 12:48:51 -07001505 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
Ethan Furman9bf7c2d2021-07-03 21:08:42 -07001506 '...Int.IDES...')
Ethan Furman13bdfa72013-08-31 12:48:51 -07001507 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1508 '...15...')
1509 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1510 '...15...')
1511 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1512 '...15...')
1513 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1514 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001515
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001516 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001517 format_string = "%.{}f".format(sys.maxsize + 1)
1518 with self.assertRaises(ValueError):
1519 result = format_string % 2.34
1520
Martijn Pietersd7e64332017-02-23 13:38:04 +00001521 def test_issue28598_strsubclass_rhs(self):
1522 # A subclass of str with an __rmod__ method should be able to hook
1523 # into the % operator
1524 class SubclassedStr(str):
1525 def __rmod__(self, other):
1526 return 'Success, self.__rmod__({!r}) was called'.format(other)
1527 self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1528 "Success, self.__rmod__('lhs %% %r') was called")
1529
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001530 @support.cpython_only
1531 def test_formatting_huge_precision_c_limits(self):
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001532 from _testcapi import INT_MAX
1533 format_string = "%.{}f".format(INT_MAX + 1)
1534 with self.assertRaises(ValueError):
1535 result = format_string % 2.34
1536
1537 def test_formatting_huge_width(self):
1538 format_string = "%{}f".format(sys.maxsize + 1)
1539 with self.assertRaises(ValueError):
1540 result = format_string % 2.34
1541
Ezio Melottiba42fd52011-04-26 06:09:45 +03001542 def test_startswith_endswith_errors(self):
1543 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001544 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001545 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001546 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001547 self.assertIn('str', exc)
1548 self.assertIn('tuple', exc)
1549
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001550 @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
Georg Brandlda6b1072006-01-20 17:48:54 +00001551 def test_format_float(self):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001552 # should not format with a comma, but always with C locale
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001553 self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001554
Walter Dörwald28256f22003-01-19 16:59:20 +00001555 def test_constructor(self):
1556 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1557
1558 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001559 str('unicode remains unicode'),
1560 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001561 )
1562
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001563 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
Serhiy Storchaka15095802015-11-25 15:47:01 +02001564 subclass = StrSubclass(text)
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001565 self.assertEqual(str(subclass), text)
1566 self.assertEqual(len(subclass), len(text))
1567 if text == 'ascii':
1568 self.assertEqual(subclass.encode('ascii'), b'ascii')
1569 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001570
Walter Dörwald28256f22003-01-19 16:59:20 +00001571 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001572 str('strings are converted to unicode'),
1573 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001574 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001575
Walter Dörwald28256f22003-01-19 16:59:20 +00001576 class StringCompat:
1577 def __init__(self, x):
1578 self.x = x
1579 def __str__(self):
1580 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001581
Walter Dörwald28256f22003-01-19 16:59:20 +00001582 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001583 str(StringCompat('__str__ compatible objects are recognized')),
1584 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001585 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001586
Walter Dörwald28256f22003-01-19 16:59:20 +00001587 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001588
Walter Dörwald28256f22003-01-19 16:59:20 +00001589 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001590 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001591 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001592
Guido van Rossume2a383d2007-01-15 16:59:06 +00001593 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001594 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001595
Walter Dörwald28256f22003-01-19 16:59:20 +00001596 # unicode(obj, encoding, error) tests (this maps to
1597 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001598
Walter Dörwald28256f22003-01-19 16:59:20 +00001599 if not sys.platform.startswith('java'):
1600 self.assertRaises(
1601 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001602 str,
1603 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001604 'utf-8',
1605 'strict'
1606 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001607
Walter Dörwald28256f22003-01-19 16:59:20 +00001608 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001609 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001610 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001611 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001612
Walter Dörwald28256f22003-01-19 16:59:20 +00001613 if not sys.platform.startswith('java'):
1614 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001615 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001616 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001617 'utf-8',
1618 'strict'
1619 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001620 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001621 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001622
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001623 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001624
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001625 def test_constructor_keyword_args(self):
1626 """Pass various keyword argument combinations to the constructor."""
1627 # The object argument can be passed as a keyword.
1628 self.assertEqual(str(object='foo'), 'foo')
1629 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1630 # The errors argument without encoding triggers "decode" mode.
1631 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1632 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1633
1634 def test_constructor_defaults(self):
1635 """Check the constructor argument defaults."""
1636 # The object argument defaults to '' or b''.
1637 self.assertEqual(str(), '')
1638 self.assertEqual(str(errors='strict'), '')
1639 utf8_cent = '¢'.encode('utf-8')
1640 # The encoding argument defaults to utf-8.
1641 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1642 # The errors argument defaults to strict.
1643 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1644
Walter Dörwald28256f22003-01-19 16:59:20 +00001645 def test_codecs_utf7(self):
1646 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001647 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1648 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1649 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1650 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1651 ('+', b'+-'),
1652 ('+-', b'+--'),
1653 ('+?', b'+-?'),
R David Murray44b548d2016-09-08 13:59:53 -04001654 (r'\?', b'+AFw?'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001655 ('+?', b'+-?'),
1656 (r'\\?', b'+AFwAXA?'),
1657 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001658 (r'++--', b'+-+---'),
1659 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1660 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001661 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001662
Walter Dörwald28256f22003-01-19 16:59:20 +00001663 for (x, y) in utfTests:
1664 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001665
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001666 # Unpaired surrogates are passed through
1667 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1668 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1669 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1670 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1671 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1672 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1673 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1674 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001675
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001676 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1677 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001678
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001679 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001680 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001681
1682 # Direct encoded characters
1683 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1684 # Optional direct characters
1685 set_o = '!"#$%&*;<=>@[]^_`{|}'
1686 for c in set_d:
1687 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1688 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1689 for c in set_o:
1690 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001691
Zackery Spytze349bf22018-08-18 22:43:38 -06001692 with self.assertRaisesRegex(UnicodeDecodeError,
1693 'ill-formed sequence'):
1694 b'+@'.decode('utf-7')
1695
Walter Dörwald28256f22003-01-19 16:59:20 +00001696 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001697 self.assertEqual(''.encode('utf-8'), b'')
1698 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001699 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1700 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001701 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1702 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001703 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1704 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001705 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001706 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1707 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1708 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1709 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1710 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1711 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001712 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1713 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1714 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1715 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1716 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1717 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1718 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1719 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1720 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1721 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001722 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001723
Walter Dörwald28256f22003-01-19 16:59:20 +00001724 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001725 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1726 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1727 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728
Walter Dörwald28256f22003-01-19 16:59:20 +00001729 # Other possible utf-8 test cases:
1730 # * strict decoding testing for all of the
1731 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732
Ezio Melotti57221d02010-07-01 07:32:02 +00001733 def test_utf8_decode_valid_sequences(self):
1734 sequences = [
1735 # single byte
1736 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1737 # 2 bytes
1738 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1739 # 3 bytes
1740 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1741 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1742 # 4 bytes
1743 (b'\xF0\x90\x80\x80', '\U00010000'),
1744 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1745 ]
1746 for seq, res in sequences:
1747 self.assertEqual(seq.decode('utf-8'), res)
1748
1749
1750 def test_utf8_decode_invalid_sequences(self):
1751 # continuation bytes in a sequence of 2, 3, or 4 bytes
1752 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001753 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001754 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001755 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001756 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1757 invalid_start_bytes = (
1758 continuation_bytes + invalid_2B_seq_start_bytes +
1759 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1760 )
1761
1762 for byte in invalid_start_bytes:
1763 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1764
1765 for sb in invalid_2B_seq_start_bytes:
1766 for cb in continuation_bytes:
1767 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1768
1769 for sb in invalid_4B_seq_start_bytes:
1770 for cb1 in continuation_bytes[:3]:
1771 for cb3 in continuation_bytes[:3]:
1772 self.assertRaises(UnicodeDecodeError,
1773 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1774
1775 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1776 self.assertRaises(UnicodeDecodeError,
1777 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1778 self.assertRaises(UnicodeDecodeError,
1779 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1780 # surrogates
1781 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1782 self.assertRaises(UnicodeDecodeError,
1783 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1784 self.assertRaises(UnicodeDecodeError,
1785 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1786 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1787 self.assertRaises(UnicodeDecodeError,
1788 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1789 self.assertRaises(UnicodeDecodeError,
1790 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1791 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1792 self.assertRaises(UnicodeDecodeError,
1793 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1794 self.assertRaises(UnicodeDecodeError,
1795 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1796
1797 def test_issue8271(self):
1798 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1799 # only the start byte and the continuation byte(s) are now considered
1800 # invalid, instead of the number of bytes specified by the start byte.
Benjamin Peterson51796e52020-03-10 21:10:59 -07001801 # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
Ezio Melotti57221d02010-07-01 07:32:02 +00001802 # table 3-8, Row 2) for more information about the algorithm used.
1803 FFFD = '\ufffd'
1804 sequences = [
1805 # invalid start bytes
1806 (b'\x80', FFFD), # continuation byte
1807 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1808 (b'\xc0', FFFD),
1809 (b'\xc0\xc0', FFFD*2),
1810 (b'\xc1', FFFD),
1811 (b'\xc1\xc0', FFFD*2),
1812 (b'\xc0\xc1', FFFD*2),
1813 # with start byte of a 2-byte sequence
1814 (b'\xc2', FFFD), # only the start byte
1815 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001816 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001817 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1818 # with start byte of a 3-byte sequence
1819 (b'\xe1', FFFD), # only the start byte
1820 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1821 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1822 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1823 (b'\xe1\x80', FFFD), # only 1 continuation byte
1824 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1825 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1826 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1827 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1828 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1829 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1830 # with start byte of a 4-byte sequence
1831 (b'\xf1', FFFD), # only the start byte
1832 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1833 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1834 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1835 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1836 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1837 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1838 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1839 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1840 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1841 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1842 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1843 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1844 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1845 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1846 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1847 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1848 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1849 # with invalid start byte of a 4-byte sequence (rfc2279)
1850 (b'\xf5', FFFD), # only the start byte
1851 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1852 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1853 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1854 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1855 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1856 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1857 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1858 # with invalid start byte of a 5-byte sequence (rfc2279)
1859 (b'\xf8', FFFD), # only the start byte
1860 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1861 (b'\xf8\x80', FFFD*2), # only one continuation byte
1862 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1863 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1864 # with invalid start byte of a 6-byte sequence (rfc2279)
1865 (b'\xfc', FFFD), # only the start byte
1866 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1867 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1868 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1869 # invalid start byte
1870 (b'\xfe', FFFD),
1871 (b'\xfe\x80\x80', FFFD*3),
1872 # other sequences
1873 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1874 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1875 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1876 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1877 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1878 ]
1879 for n, (seq, res) in enumerate(sequences):
1880 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1881 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1882 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1883 self.assertEqual(seq.decode('utf-8', 'ignore'),
1884 res.replace('\uFFFD', ''))
1885
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001886 def assertCorrectUTF8Decoding(self, seq, res, err):
1887 """
Martin Panter6245cb32016-04-15 02:14:19 +00001888 Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001889 'strict' is used, returns res when 'replace' is used, and that doesn't
1890 return anything when 'ignore' is used.
1891 """
1892 with self.assertRaises(UnicodeDecodeError) as cm:
1893 seq.decode('utf-8')
1894 exc = cm.exception
1895
1896 self.assertIn(err, str(exc))
1897 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1898 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1899 'aaaa' + res + 'bbbb')
1900 res = res.replace('\ufffd', '')
1901 self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1902 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1903 'aaaa' + res + 'bbbb')
1904
1905 def test_invalid_start_byte(self):
1906 """
1907 Test that an 'invalid start byte' error is raised when the first byte
1908 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1909 4-bytes sequence. The invalid start byte is replaced with a single
1910 U+FFFD when errors='replace'.
1911 E.g. <80> is a continuation byte and can appear only after a start byte.
1912 """
1913 FFFD = '\ufffd'
1914 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1915 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1916 'invalid start byte')
1917
1918 def test_unexpected_end_of_data(self):
1919 """
1920 Test that an 'unexpected end of data' error is raised when the string
1921 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1922 enough continuation bytes. The incomplete sequence is replaced with a
1923 single U+FFFD when errors='replace'.
1924 E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1925 sequence, but it's followed by only 2 valid continuation bytes and the
1926 last continuation bytes is missing.
1927 Note: the continuation bytes must be all valid, if one of them is
1928 invalid another error will be raised.
1929 """
1930 sequences = [
1931 'C2', 'DF',
1932 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1933 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1934 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1935 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1936 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1937 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1938 ]
1939 FFFD = '\ufffd'
1940 for seq in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02001941 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001942 'unexpected end of data')
1943
1944 def test_invalid_cb_for_2bytes_seq(self):
1945 """
1946 Test that an 'invalid continuation byte' error is raised when the
1947 continuation byte of a 2-bytes sequence is invalid. The start byte
1948 is replaced by a single U+FFFD and the second byte is handled
1949 separately when errors='replace'.
1950 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1951 sequence, but 41 is not a valid continuation byte because it's the
1952 ASCII letter 'A'.
1953 """
1954 FFFD = '\ufffd'
1955 FFFDx2 = FFFD * 2
1956 sequences = [
1957 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1958 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1959 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1960 ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1961 ]
1962 for seq, res in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02001963 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001964 'invalid continuation byte')
1965
1966 def test_invalid_cb_for_3bytes_seq(self):
1967 """
1968 Test that an 'invalid continuation byte' error is raised when the
1969 continuation byte(s) of a 3-bytes sequence are invalid. When
1970 errors='replace', if the first continuation byte is valid, the first
1971 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1972 third byte is handled separately, otherwise only the start byte is
1973 replaced with a U+FFFD and the other continuation bytes are handled
1974 separately.
1975 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1976 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1977 because it's the ASCII letter 'A'.
1978 Note: when the start byte is E0 or ED, the valid ranges for the first
1979 continuation byte are limited to A0..BF and 80..9F respectively.
1980 Python 2 used to consider all the bytes in range 80..BF valid when the
1981 start byte was ED. This is fixed in Python 3.
1982 """
1983 FFFD = '\ufffd'
1984 FFFDx2 = FFFD * 2
1985 sequences = [
1986 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1987 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1988 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1989 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1990 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1991 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1992 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1993 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1994 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1995 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1996 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1997 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1998 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1999 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
2000 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
2001 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
2002 ('ED 7F', FFFD+'\x7f'),
2003 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
2004 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
2005 ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
2006 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
2007 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
2008 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
2009 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
2010 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
2011 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
2012 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
2013 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
2014 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
2015 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
2016 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
2017 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
2018 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
2019 ]
2020 for seq, res in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02002021 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
Ezio Melottif7ed5d12012-11-04 23:21:38 +02002022 'invalid continuation byte')
2023
2024 def test_invalid_cb_for_4bytes_seq(self):
2025 """
2026 Test that an 'invalid continuation byte' error is raised when the
2027 continuation byte(s) of a 4-bytes sequence are invalid. When
2028 errors='replace',the start byte and all the following valid
2029 continuation bytes are replaced with a single U+FFFD, and all the bytes
2030 starting from the first invalid continuation bytes (included) are
2031 handled separately.
2032 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
2033 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
2034 because it's the ASCII letter 'A'.
2035 Note: when the start byte is E0 or ED, the valid ranges for the first
2036 continuation byte are limited to A0..BF and 80..9F respectively.
2037 However, when the start byte is ED, Python 2 considers all the bytes
2038 in range 80..BF valid. This is fixed in Python 3.
2039 """
2040 FFFD = '\ufffd'
2041 FFFDx2 = FFFD * 2
2042 sequences = [
2043 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
2044 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
2045 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
2046 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
2047 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
2048 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
2049 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
2050 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
2051 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2052 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2053 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2054 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2055 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2056 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2057 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2058 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2059 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2060 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2061 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2062 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2063 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2064 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2065 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2066 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2067 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2068 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2069 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2070 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2071 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2072 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2073 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2074 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2075 ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2076 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2077 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2078 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2079 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2080 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2081 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2082 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2083 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2084 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2085 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2086 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2087 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2088 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2089 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2090 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2091 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2092 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2093 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2094 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2095 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2096 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2097 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2098 ]
2099 for seq, res in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02002100 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
Ezio Melottif7ed5d12012-11-04 23:21:38 +02002101 'invalid continuation byte')
2102
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002103 def test_codecs_idna(self):
2104 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00002105 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002106
Walter Dörwald28256f22003-01-19 16:59:20 +00002107 def test_codecs_errors(self):
2108 # Error handling (encoding)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002109 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2110 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
Walter Dörwald67e83882007-05-05 12:26:27 +00002111 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2112 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
Benjamin Peterson308d6372009-09-18 21:42:35 +00002113 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2114 'Andr\202 x'.encode('ascii', errors='replace'))
2115 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2116 'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002117
Walter Dörwald28256f22003-01-19 16:59:20 +00002118 # Error handling (decoding)
Walter Dörwald67e83882007-05-05 12:26:27 +00002119 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2120 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2121 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2122 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03002123 self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124
Walter Dörwald28256f22003-01-19 16:59:20 +00002125 # Error handling (unknown character names)
Guido van Rossum39478e82007-08-27 17:23:59 +00002126 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127
Walter Dörwald28256f22003-01-19 16:59:20 +00002128 # Error handling (truncated escape sequence)
Guido van Rossum9c627722007-08-27 18:31:48 +00002129 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002130
Guido van Rossum9c627722007-08-27 18:31:48 +00002131 self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2132 self.assertRaises(TypeError, str, b"hello", "test.unicode2")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002133 self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2134 self.assertRaises(TypeError, "hello".encode, "test.unicode2")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002135
Walter Dörwald28256f22003-01-19 16:59:20 +00002136 # Error handling (wrong arguments)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002137 self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002138
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02002139 # Error handling (lone surrogate in
2140 # _PyUnicode_TransformDecimalAndSpaceToASCII())
2141 self.assertRaises(ValueError, int, "\ud800")
2142 self.assertRaises(ValueError, int, "\udf00")
2143 self.assertRaises(ValueError, float, "\ud800")
2144 self.assertRaises(ValueError, float, "\udf00")
2145 self.assertRaises(ValueError, complex, "\ud800")
2146 self.assertRaises(ValueError, complex, "\udf00")
Guido van Rossum97064862000-04-10 13:52:48 +00002147
Walter Dörwald28256f22003-01-19 16:59:20 +00002148 def test_codecs(self):
2149 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002150 self.assertEqual('hello'.encode('ascii'), b'hello')
2151 self.assertEqual('hello'.encode('utf-7'), b'hello')
2152 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002153 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002154 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2155 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2156 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002157
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002158 # Default encoding is utf-8
2159 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2160
Walter Dörwald28256f22003-01-19 16:59:20 +00002161 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002162 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002163 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002164 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2165 'utf-16-be', 'raw_unicode_escape',
Inada Naoki6a16b182019-03-18 15:44:11 +09002166 'unicode_escape'):
2167 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002168
Walter Dörwald28256f22003-01-19 16:59:20 +00002169 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002170 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002171 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002172 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002173 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002174
Walter Dörwald28256f22003-01-19 16:59:20 +00002175 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002176 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002177 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002178 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002179 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002180
Walter Dörwald28256f22003-01-19 16:59:20 +00002181 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002182 with warnings.catch_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01002183 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2184 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Inada Naoki6a16b182019-03-18 15:44:11 +09002185 'raw_unicode_escape', 'unicode_escape'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002186 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002187
Antoine Pitrou51f66482011-11-11 13:35:44 +01002188 # UTF-8 must be roundtrip safe for all code points
2189 # (except surrogates, which are forbidden).
2190 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002191 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002192 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002193 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002194
Walter Dörwald28256f22003-01-19 16:59:20 +00002195 def test_codecs_charmap(self):
2196 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002197 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002198 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002199 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002200 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2201 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002202 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002203 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2204 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002205 'iso8859_7', 'iso8859_9',
2206 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002207 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002208
Walter Dörwald28256f22003-01-19 16:59:20 +00002209 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2210 'cp1256', 'cp1257', 'cp1258',
2211 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002212
Walter Dörwald28256f22003-01-19 16:59:20 +00002213 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2214 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002215
Walter Dörwald28256f22003-01-19 16:59:20 +00002216 ### These have undefined mappings:
2217 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002218
Walter Dörwald28256f22003-01-19 16:59:20 +00002219 ### These fail the round-trip:
2220 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002221
Walter Dörwald28256f22003-01-19 16:59:20 +00002222 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002223 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002224
Walter Dörwald28256f22003-01-19 16:59:20 +00002225 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002226 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002227 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002228 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002229 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2230 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002231 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002232 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2233 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002234 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002235 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002236
Walter Dörwald28256f22003-01-19 16:59:20 +00002237 ### These have undefined mappings:
2238 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2239 #'cp1256', 'cp1257', 'cp1258',
2240 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002241 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002242 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002243
Walter Dörwald28256f22003-01-19 16:59:20 +00002244 ### These fail the round-trip:
2245 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002246
Walter Dörwald28256f22003-01-19 16:59:20 +00002247 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002248 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002249
Walter Dörwald28256f22003-01-19 16:59:20 +00002250 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002251 self.assertEqual(("abc" "def"), "abcdef")
2252 self.assertEqual(("abc" "def"), "abcdef")
2253 self.assertEqual(("abc" "def"), "abcdef")
2254 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2255 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002256
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002257 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002258 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002259 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2260 self.assertEqual(x, y)
2261
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002262 y = br'\U00100000'
2263 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2264 self.assertEqual(x, y)
2265 y = br'\U00010000'
2266 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2267 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002268
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002269 try:
2270 br'\U11111111'.decode("raw-unicode-escape")
2271 except UnicodeDecodeError as e:
2272 self.assertEqual(e.start, 0)
2273 self.assertEqual(e.end, 10)
2274 else:
2275 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002276
Brett Cannonc3647ac2005-04-26 03:45:26 +00002277 def test_conversion(self):
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002278 # Make sure __str__() works properly
2279 class ObjectToStr:
Brett Cannonc3647ac2005-04-26 03:45:26 +00002280 def __str__(self):
2281 return "foo"
2282
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002283 class StrSubclassToStr(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002284 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002285 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002286
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002287 class StrSubclassToStrSubclass(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002288 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002289 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002290 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002291 return self
2292
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002293 self.assertEqual(str(ObjectToStr()), "foo")
2294 self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2295 s = str(StrSubclassToStrSubclass("foo"))
2296 self.assertEqual(s, "foofoo")
2297 self.assertIs(type(s), StrSubclassToStrSubclass)
Serhiy Storchaka15095802015-11-25 15:47:01 +02002298 s = StrSubclass(StrSubclassToStrSubclass("foo"))
2299 self.assertEqual(s, "foofoo")
2300 self.assertIs(type(s), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002301
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002302 def test_unicode_repr(self):
2303 class s1:
2304 def __repr__(self):
2305 return '\\n'
2306
2307 class s2:
2308 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002309 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002310
2311 self.assertEqual(repr(s1()), '\\n')
2312 self.assertEqual(repr(s2()), '\\n')
2313
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002314 def test_printable_repr(self):
2315 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002316 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002317
Zachary Ware9fe6d862013-12-08 00:20:35 -06002318 # This test only affects 32-bit platforms because expandtabs can only take
2319 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2320 # to take a 64-bit long, this test should apply to all platforms.
2321 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2322 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002323 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002324 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002325
Victor Stinner1d972ad2011-10-07 13:31:46 +02002326 @support.cpython_only
Antoine Pitroue19aa382011-10-04 16:04:01 +02002327 def test_expandtabs_optimization(self):
2328 s = 'abc'
2329 self.assertIs(s.expandtabs(), s)
2330
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002331 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 if struct.calcsize('P') == 8:
2333 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002334 ascii_struct_size = 48
2335 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336 else:
2337 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002338 ascii_struct_size = 24
2339 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340
2341 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2342 code = ord(char)
2343 if code < 0x100:
2344 char_size = 1 # sizeof(Py_UCS1)
2345 struct_size = ascii_struct_size
2346 elif code < 0x10000:
2347 char_size = 2 # sizeof(Py_UCS2)
2348 struct_size = compact_struct_size
2349 else:
2350 char_size = 4 # sizeof(Py_UCS4)
2351 struct_size = compact_struct_size
2352 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002353 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2354 # be allocatable, given enough memory.
2355 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 alloc = lambda: char * maxlen
2357 self.assertRaises(MemoryError, alloc)
2358 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002359
Victor Stinner808fc0a2010-03-22 12:50:40 +00002360 def test_format_subclass(self):
2361 class S(str):
2362 def __str__(self):
2363 return '__str__ overridden'
2364 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002365 self.assertEqual("%s" % s, '__str__ overridden')
2366 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002367
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002368 def test_subclass_add(self):
2369 class S(str):
2370 def __add__(self, o):
2371 return "3"
2372 self.assertEqual(S("4") + S("5"), "3")
2373 class S(str):
2374 def __iadd__(self, o):
2375 return "3"
2376 s = S("1")
2377 s += "4"
2378 self.assertEqual(s, "3")
2379
2380 def test_getnewargs(self):
2381 text = 'abc'
2382 args = text.__getnewargs__()
2383 self.assertIsNot(args[0], text)
2384 self.assertEqual(args[0], text)
2385 self.assertEqual(len(args), 1)
2386
Inada Naoki6a16b182019-03-18 15:44:11 +09002387 @support.cpython_only
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03002388 @support.requires_legacy_unicode_capi
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002389 def test_resize(self):
Inada Naoki6a16b182019-03-18 15:44:11 +09002390 from _testcapi import getargs_u
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002391 for length in range(1, 100, 7):
2392 # generate a fresh string (refcount=1)
2393 text = 'a' * length + 'b'
2394
Inada Naoki6a16b182019-03-18 15:44:11 +09002395 # fill wstr internal field
Inada Naoki91a639a2021-02-22 22:11:48 +09002396 with self.assertWarns(DeprecationWarning):
2397 abc = getargs_u(text)
Inada Naoki6a16b182019-03-18 15:44:11 +09002398 self.assertEqual(abc, text)
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002399
Inada Naoki6a16b182019-03-18 15:44:11 +09002400 # resize text: wstr field must be cleared and then recomputed
2401 text += 'c'
Inada Naoki91a639a2021-02-22 22:11:48 +09002402 with self.assertWarns(DeprecationWarning):
2403 abcdef = getargs_u(text)
Inada Naoki6a16b182019-03-18 15:44:11 +09002404 self.assertNotEqual(abc, abcdef)
2405 self.assertEqual(abcdef, text)
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002406
2407 def test_compare(self):
2408 # Issue #17615
2409 N = 10
2410 ascii = 'a' * N
2411 ascii2 = 'z' * N
2412 latin = '\x80' * N
2413 latin2 = '\xff' * N
2414 bmp = '\u0100' * N
2415 bmp2 = '\uffff' * N
2416 astral = '\U00100000' * N
2417 astral2 = '\U0010ffff' * N
2418 strings = (
2419 ascii, ascii2,
2420 latin, latin2,
2421 bmp, bmp2,
2422 astral, astral2)
2423 for text1, text2 in itertools.combinations(strings, 2):
2424 equal = (text1 is text2)
2425 self.assertEqual(text1 == text2, equal)
2426 self.assertEqual(text1 != text2, not equal)
2427
2428 if equal:
2429 self.assertTrue(text1 <= text2)
2430 self.assertTrue(text1 >= text2)
2431
2432 # text1 is text2: duplicate strings to skip the "str1 == str2"
2433 # optimization in unicode_compare_eq() and really compare
2434 # character per character
2435 copy1 = duplicate_string(text1)
2436 copy2 = duplicate_string(text2)
2437 self.assertIsNot(copy1, copy2)
2438
2439 self.assertTrue(copy1 == copy2)
2440 self.assertFalse(copy1 != copy2)
2441
2442 self.assertTrue(copy1 <= copy2)
2443 self.assertTrue(copy2 >= copy2)
2444
2445 self.assertTrue(ascii < ascii2)
2446 self.assertTrue(ascii < latin)
2447 self.assertTrue(ascii < bmp)
2448 self.assertTrue(ascii < astral)
2449 self.assertFalse(ascii >= ascii2)
2450 self.assertFalse(ascii >= latin)
2451 self.assertFalse(ascii >= bmp)
2452 self.assertFalse(ascii >= astral)
2453
2454 self.assertFalse(latin < ascii)
2455 self.assertTrue(latin < latin2)
2456 self.assertTrue(latin < bmp)
2457 self.assertTrue(latin < astral)
2458 self.assertTrue(latin >= ascii)
2459 self.assertFalse(latin >= latin2)
2460 self.assertFalse(latin >= bmp)
2461 self.assertFalse(latin >= astral)
2462
2463 self.assertFalse(bmp < ascii)
2464 self.assertFalse(bmp < latin)
2465 self.assertTrue(bmp < bmp2)
2466 self.assertTrue(bmp < astral)
2467 self.assertTrue(bmp >= ascii)
2468 self.assertTrue(bmp >= latin)
2469 self.assertFalse(bmp >= bmp2)
2470 self.assertFalse(bmp >= astral)
2471
2472 self.assertFalse(astral < ascii)
2473 self.assertFalse(astral < latin)
2474 self.assertFalse(astral < bmp2)
2475 self.assertTrue(astral < astral2)
2476 self.assertTrue(astral >= ascii)
2477 self.assertTrue(astral >= latin)
2478 self.assertTrue(astral >= bmp2)
2479 self.assertFalse(astral >= astral2)
2480
2481 def test_free_after_iterating(self):
2482 support.check_free_after_iterating(self, iter, str)
2483 support.check_free_after_iterating(self, reversed, str)
2484
Victor Stinner22eb6892019-06-26 00:51:05 +02002485 def test_check_encoding_errors(self):
2486 # bpo-37388: str(bytes) and str.decode() must check encoding and errors
2487 # arguments in dev mode
2488 encodings = ('ascii', 'utf8', 'latin1')
2489 invalid = 'Boom, Shaka Laka, Boom!'
2490 code = textwrap.dedent(f'''
2491 import sys
2492 encodings = {encodings!r}
2493
2494 for data in (b'', b'short string'):
2495 try:
2496 str(data, encoding={invalid!r})
2497 except LookupError:
2498 pass
2499 else:
2500 sys.exit(21)
2501
2502 try:
2503 str(data, errors={invalid!r})
2504 except LookupError:
2505 pass
2506 else:
2507 sys.exit(22)
2508
2509 for encoding in encodings:
2510 try:
2511 str(data, encoding, errors={invalid!r})
2512 except LookupError:
2513 pass
2514 else:
2515 sys.exit(22)
2516
2517 for data in ('', 'short string'):
2518 try:
2519 data.encode(encoding={invalid!r})
2520 except LookupError:
2521 pass
2522 else:
2523 sys.exit(23)
2524
2525 try:
2526 data.encode(errors={invalid!r})
2527 except LookupError:
2528 pass
2529 else:
2530 sys.exit(24)
2531
2532 for encoding in encodings:
2533 try:
2534 data.encode(encoding, errors={invalid!r})
2535 except LookupError:
2536 pass
2537 else:
2538 sys.exit(24)
2539
2540 sys.exit(10)
2541 ''')
2542 proc = assert_python_failure('-X', 'dev', '-c', code)
2543 self.assertEqual(proc.rc, 10, proc)
2544
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002545
2546class CAPITest(unittest.TestCase):
2547
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002548 # Test PyUnicode_FromFormat()
Victor Stinner1205f272010-09-11 00:54:47 +00002549 def test_from_format(self):
Hai Shideb01622020-07-06 20:29:49 +08002550 import_helper.import_module('ctypes')
Victor Stinner15a11362012-10-06 23:48:20 +02002551 from ctypes import (
Ronald Oussoren41761932020-11-08 10:05:27 +01002552 c_char_p,
Victor Stinner15a11362012-10-06 23:48:20 +02002553 pythonapi, py_object, sizeof,
Victor Stinner6d970f42011-03-02 00:04:25 +00002554 c_int, c_long, c_longlong, c_ssize_t,
Victor Stinner15a11362012-10-06 23:48:20 +02002555 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 name = "PyUnicode_FromFormat"
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002557 _PyUnicode_FromFormat = getattr(pythonapi, name)
Ronald Oussoren41761932020-11-08 10:05:27 +01002558 _PyUnicode_FromFormat.argtypes = (c_char_p,)
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002559 _PyUnicode_FromFormat.restype = py_object
2560
2561 def PyUnicode_FromFormat(format, *args):
2562 cargs = tuple(
2563 py_object(arg) if isinstance(arg, str) else arg
2564 for arg in args)
2565 return _PyUnicode_FromFormat(format, *cargs)
Victor Stinner1205f272010-09-11 00:54:47 +00002566
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567 def check_format(expected, format, *args):
2568 text = PyUnicode_FromFormat(format, *args)
2569 self.assertEqual(expected, text)
2570
Victor Stinner1205f272010-09-11 00:54:47 +00002571 # ascii format, non-ascii argument
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572 check_format('ascii\x7f=unicode\xe9',
2573 b'ascii\x7f=%U', 'unicode\xe9')
Victor Stinner1205f272010-09-11 00:54:47 +00002574
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002575 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2576 # raises an error
Ezio Melottied3a7d22010-12-01 02:32:32 +00002577 self.assertRaisesRegex(ValueError,
R David Murray44b548d2016-09-08 13:59:53 -04002578 r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
Victor Stinner4c7db312010-09-12 07:51:18 +00002579 'string, got a non-ASCII byte: 0xe9$',
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002580 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002581
Victor Stinner96865452011-03-01 23:44:09 +00002582 # test "%c"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 check_format('\uabcd',
2584 b'%c', c_int(0xabcd))
2585 check_format('\U0010ffff',
2586 b'%c', c_int(0x10ffff))
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002587 with self.assertRaises(OverflowError):
2588 PyUnicode_FromFormat(b'%c', c_int(0x110000))
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +03002589 # Issue #18183
Serhiy Storchakaf15ffe02013-06-12 09:28:20 +03002590 check_format('\U00010000\U00100000',
2591 b'%c%c', c_int(0x10000), c_int(0x100000))
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002592
Victor Stinner96865452011-03-01 23:44:09 +00002593 # test "%"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002594 check_format('%',
2595 b'%')
2596 check_format('%',
2597 b'%%')
2598 check_format('%s',
2599 b'%%s')
2600 check_format('[%]',
2601 b'[%%]')
2602 check_format('%abc',
2603 b'%%%s', b'abc')
2604
2605 # truncated string
2606 check_format('abc',
2607 b'%.3s', b'abcdef')
2608 check_format('abc[\ufffd',
2609 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2610 check_format("'\\u20acABC'",
2611 b'%A', '\u20acABC')
2612 check_format("'\\u20",
2613 b'%.5A', '\u20acABCDEF')
2614 check_format("'\u20acABC'",
2615 b'%R', '\u20acABC')
2616 check_format("'\u20acA",
2617 b'%.3R', '\u20acABCDEF')
2618 check_format('\u20acAB',
2619 b'%.3S', '\u20acABCDEF')
2620 check_format('\u20acAB',
2621 b'%.3U', '\u20acABCDEF')
2622 check_format('\u20acAB',
2623 b'%.3V', '\u20acABCDEF', None)
2624 check_format('abc[\ufffd',
2625 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2626
2627 # following tests comes from #7330
2628 # test width modifier and precision modifier with %S
2629 check_format("repr= abc",
2630 b'repr=%5S', 'abc')
2631 check_format("repr=ab",
2632 b'repr=%.2S', 'abc')
2633 check_format("repr= ab",
2634 b'repr=%5.2S', 'abc')
2635
2636 # test width modifier and precision modifier with %R
2637 check_format("repr= 'abc'",
2638 b'repr=%8R', 'abc')
2639 check_format("repr='ab",
2640 b'repr=%.3R', 'abc')
2641 check_format("repr= 'ab",
2642 b'repr=%5.3R', 'abc')
2643
2644 # test width modifier and precision modifier with %A
2645 check_format("repr= 'abc'",
2646 b'repr=%8A', 'abc')
2647 check_format("repr='ab",
2648 b'repr=%.3A', 'abc')
2649 check_format("repr= 'ab",
2650 b'repr=%5.3A', 'abc')
2651
2652 # test width modifier and precision modifier with %s
2653 check_format("repr= abc",
2654 b'repr=%5s', b'abc')
2655 check_format("repr=ab",
2656 b'repr=%.2s', b'abc')
2657 check_format("repr= ab",
2658 b'repr=%5.2s', b'abc')
2659
2660 # test width modifier and precision modifier with %U
2661 check_format("repr= abc",
2662 b'repr=%5U', 'abc')
2663 check_format("repr=ab",
2664 b'repr=%.2U', 'abc')
2665 check_format("repr= ab",
2666 b'repr=%5.2U', 'abc')
2667
2668 # test width modifier and precision modifier with %V
2669 check_format("repr= abc",
2670 b'repr=%5V', 'abc', b'123')
2671 check_format("repr=ab",
2672 b'repr=%.2V', 'abc', b'123')
2673 check_format("repr= ab",
2674 b'repr=%5.2V', 'abc', b'123')
2675 check_format("repr= 123",
2676 b'repr=%5V', None, b'123')
2677 check_format("repr=12",
2678 b'repr=%.2V', None, b'123')
2679 check_format("repr= 12",
2680 b'repr=%5.2V', None, b'123')
Victor Stinner96865452011-03-01 23:44:09 +00002681
Victor Stinner6d970f42011-03-02 00:04:25 +00002682 # test integer formats (%i, %d, %u)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002683 check_format('010',
2684 b'%03i', c_int(10))
2685 check_format('0010',
2686 b'%0.4i', c_int(10))
2687 check_format('-123',
2688 b'%i', c_int(-123))
2689 check_format('-123',
2690 b'%li', c_long(-123))
2691 check_format('-123',
2692 b'%lli', c_longlong(-123))
2693 check_format('-123',
2694 b'%zi', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002695
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002696 check_format('-123',
2697 b'%d', c_int(-123))
2698 check_format('-123',
2699 b'%ld', c_long(-123))
2700 check_format('-123',
2701 b'%lld', c_longlong(-123))
2702 check_format('-123',
2703 b'%zd', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002704
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002705 check_format('123',
2706 b'%u', c_uint(123))
2707 check_format('123',
2708 b'%lu', c_ulong(123))
2709 check_format('123',
2710 b'%llu', c_ulonglong(123))
2711 check_format('123',
2712 b'%zu', c_size_t(123))
Victor Stinner6d970f42011-03-02 00:04:25 +00002713
Victor Stinner15a11362012-10-06 23:48:20 +02002714 # test long output
2715 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2716 max_longlong = -min_longlong - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 check_format(str(min_longlong),
2718 b'%lld', c_longlong(min_longlong))
2719 check_format(str(max_longlong),
2720 b'%lld', c_longlong(max_longlong))
Victor Stinner15a11362012-10-06 23:48:20 +02002721 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722 check_format(str(max_ulonglong),
2723 b'%llu', c_ulonglong(max_ulonglong))
Victor Stinner15a11362012-10-06 23:48:20 +02002724 PyUnicode_FromFormat(b'%p', c_void_p(-1))
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 # test padding (width and/or precision)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002727 check_format('123'.rjust(10, '0'),
2728 b'%010i', c_int(123))
2729 check_format('123'.rjust(100),
2730 b'%100i', c_int(123))
2731 check_format('123'.rjust(100, '0'),
2732 b'%.100i', c_int(123))
2733 check_format('123'.rjust(80, '0').rjust(100),
2734 b'%100.80i', c_int(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002735
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736 check_format('123'.rjust(10, '0'),
2737 b'%010u', c_uint(123))
2738 check_format('123'.rjust(100),
2739 b'%100u', c_uint(123))
2740 check_format('123'.rjust(100, '0'),
2741 b'%.100u', c_uint(123))
2742 check_format('123'.rjust(80, '0').rjust(100),
2743 b'%100.80u', c_uint(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002744
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 check_format('123'.rjust(10, '0'),
2746 b'%010x', c_int(0x123))
2747 check_format('123'.rjust(100),
2748 b'%100x', c_int(0x123))
2749 check_format('123'.rjust(100, '0'),
2750 b'%.100x', c_int(0x123))
2751 check_format('123'.rjust(80, '0').rjust(100),
2752 b'%100.80x', c_int(0x123))
Victor Stinnere215d962012-10-06 23:03:36 +02002753
Victor Stinner6d970f42011-03-02 00:04:25 +00002754 # test %A
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002755 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2756 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
Victor Stinner9a909002010-10-18 20:59:24 +00002757
Victor Stinner6d970f42011-03-02 00:04:25 +00002758 # test %V
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002759 check_format('repr=abc',
2760 b'repr=%V', 'abc', b'xyz')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002761
2762 # Test string decode from parameter of %s using utf-8.
2763 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2764 # '\u4eba\u6c11'
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 check_format('repr=\u4eba\u6c11',
2766 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002767
2768 #Test replace error handler.
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 check_format('repr=abc\ufffd',
2770 b'repr=%V', None, b'abc\xff')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002771
Victor Stinner6d970f42011-03-02 00:04:25 +00002772 # not supported: copy the raw format string. these tests are just here
Martin Panter2f9171d2016-12-18 01:23:09 +00002773 # to check for crashes and should not be considered as specifications
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 check_format('%s',
2775 b'%1%s', b'abc')
2776 check_format('%1abc',
2777 b'%1abc')
2778 check_format('%+i',
2779 b'%+i', c_int(10))
2780 check_format('%.%s',
2781 b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002782
Serhiy Storchaka44cc4822019-01-12 09:22:29 +02002783 # Issue #33817: empty strings
2784 check_format('',
2785 b'')
2786 check_format('',
2787 b'%s', b'')
2788
Victor Stinner1c24bd02010-10-02 11:03:13 +00002789 # Test PyUnicode_AsWideChar()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002790 @support.cpython_only
Victor Stinner1c24bd02010-10-02 11:03:13 +00002791 def test_aswidechar(self):
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002792 from _testcapi import unicode_aswidechar
Hai Shideb01622020-07-06 20:29:49 +08002793 import_helper.import_module('ctypes')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002794 from ctypes import c_wchar, sizeof
2795
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002796 wchar, size = unicode_aswidechar('abcdef', 2)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002797 self.assertEqual(size, 2)
2798 self.assertEqual(wchar, 'ab')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002799
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002800 wchar, size = unicode_aswidechar('abc', 3)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002801 self.assertEqual(size, 3)
2802 self.assertEqual(wchar, 'abc')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002803
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002804 wchar, size = unicode_aswidechar('abc', 4)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002805 self.assertEqual(size, 3)
2806 self.assertEqual(wchar, 'abc\0')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002807
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002808 wchar, size = unicode_aswidechar('abc', 10)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002809 self.assertEqual(size, 3)
2810 self.assertEqual(wchar, 'abc\0')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002811
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002812 wchar, size = unicode_aswidechar('abc\0def', 20)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002813 self.assertEqual(size, 7)
2814 self.assertEqual(wchar, 'abc\0def\0')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 nonbmp = chr(0x10ffff)
2817 if sizeof(c_wchar) == 2:
2818 buflen = 3
2819 nchar = 2
2820 else: # sizeof(c_wchar) == 4
2821 buflen = 2
2822 nchar = 1
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002823 wchar, size = unicode_aswidechar(nonbmp, buflen)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002824 self.assertEqual(size, nchar)
2825 self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002826
Victor Stinner1c24bd02010-10-02 11:03:13 +00002827 # Test PyUnicode_AsWideCharString()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002828 @support.cpython_only
Victor Stinner1c24bd02010-10-02 11:03:13 +00002829 def test_aswidecharstring(self):
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002830 from _testcapi import unicode_aswidecharstring
Hai Shideb01622020-07-06 20:29:49 +08002831 import_helper.import_module('ctypes')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002832 from ctypes import c_wchar, sizeof
2833
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002834 wchar, size = unicode_aswidecharstring('abc')
Ezio Melottib3aedd42010-11-20 19:04:17 +00002835 self.assertEqual(size, 3)
2836 self.assertEqual(wchar, 'abc\0')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002837
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002838 wchar, size = unicode_aswidecharstring('abc\0def')
Ezio Melottib3aedd42010-11-20 19:04:17 +00002839 self.assertEqual(size, 7)
2840 self.assertEqual(wchar, 'abc\0def\0')
Victor Stinner1c24bd02010-10-02 11:03:13 +00002841
Victor Stinner5593d8a2010-10-02 11:11:27 +00002842 nonbmp = chr(0x10ffff)
2843 if sizeof(c_wchar) == 2:
2844 nchar = 2
2845 else: # sizeof(c_wchar) == 4
2846 nchar = 1
Victor Stinner46c7b3b2010-10-02 11:49:31 +00002847 wchar, size = unicode_aswidecharstring(nonbmp)
Ezio Melottib3aedd42010-11-20 19:04:17 +00002848 self.assertEqual(size, nchar)
2849 self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002850
Serhiy Storchakacc164232016-10-02 21:29:26 +03002851 # Test PyUnicode_AsUCS4()
2852 @support.cpython_only
2853 def test_asucs4(self):
2854 from _testcapi import unicode_asucs4
2855 for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2856 'a\ud800b\udfffc', '\ud834\udd1e']:
2857 l = len(s)
Serhiy Storchaka1f21eaa2019-09-01 12:16:51 +03002858 self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
2859 self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
2860 self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
2861 self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
2862 self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
2863 self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
Serhiy Storchakacc164232016-10-02 21:29:26 +03002864 s = '\0'.join([s, s])
Serhiy Storchaka1f21eaa2019-09-01 12:16:51 +03002865 self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
2866 self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
Serhiy Storchakacc164232016-10-02 21:29:26 +03002867
Hai Shi5623ac82019-07-20 02:56:23 -05002868 # Test PyUnicode_AsUTF8()
2869 @support.cpython_only
2870 def test_asutf8(self):
2871 from _testcapi import unicode_asutf8
2872
2873 bmp = '\u0100'
2874 bmp2 = '\uffff'
2875 nonbmp = chr(0x10ffff)
2876
2877 self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
2878 self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
2879 self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
2880 self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
2881
2882 # Test PyUnicode_AsUTF8AndSize()
2883 @support.cpython_only
2884 def test_asutf8andsize(self):
2885 from _testcapi import unicode_asutf8andsize
2886
2887 bmp = '\u0100'
2888 bmp2 = '\uffff'
2889 nonbmp = chr(0x10ffff)
2890
2891 self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
2892 self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
2893 self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
2894 self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
2895
Xiang Zhangb2110682016-12-20 22:52:33 +08002896 # Test PyUnicode_FindChar()
2897 @support.cpython_only
2898 def test_findchar(self):
2899 from _testcapi import unicode_findchar
2900
2901 for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2902 for i, ch in enumerate(str):
2903 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2904 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2905
2906 str = "!>_<!"
2907 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2908 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2909 # start < end
2910 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2911 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2912 # start >= end
2913 self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2914 self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2915 # negative
2916 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2917 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2918
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03002919 # Test PyUnicode_CopyCharacters()
2920 @support.cpython_only
2921 def test_copycharacters(self):
2922 from _testcapi import unicode_copycharacters
2923
2924 strings = [
2925 'abcde', '\xa1\xa2\xa3\xa4\xa5',
2926 '\u4f60\u597d\u4e16\u754c\uff01',
2927 '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2928 ]
2929
2930 for idx, from_ in enumerate(strings):
2931 # wide -> narrow: exceed maxchar limitation
2932 for to in strings[:idx]:
2933 self.assertRaises(
2934 SystemError,
2935 unicode_copycharacters, to, 0, from_, 0, 5
2936 )
2937 # same kind
2938 for from_start in range(5):
2939 self.assertEqual(
2940 unicode_copycharacters(from_, 0, from_, from_start, 5),
2941 (from_[from_start:from_start+5].ljust(5, '\0'),
2942 5-from_start)
2943 )
2944 for to_start in range(5):
2945 self.assertEqual(
2946 unicode_copycharacters(from_, to_start, from_, to_start, 5),
2947 (from_[to_start:to_start+5].rjust(5, '\0'),
2948 5-to_start)
2949 )
2950 # narrow -> wide
2951 # Tests omitted since this creates invalid strings.
2952
2953 s = strings[0]
2954 self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2955 self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2956 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2957 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2958 self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2959 self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2960 self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2961
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002962 @support.cpython_only
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03002963 @support.requires_legacy_unicode_capi
Victor Stinner42bf7752011-11-21 22:52:58 +01002964 def test_encode_decimal(self):
2965 from _testcapi import unicode_encodedecimal
Zackery Spytz8aabfa82021-03-06 23:12:35 -07002966 with warnings_helper.check_warnings():
2967 warnings.simplefilter('ignore', DeprecationWarning)
2968 self.assertEqual(unicode_encodedecimal('123'),
2969 b'123')
2970 self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2971 b'3.14')
2972 self.assertEqual(unicode_encodedecimal(
2973 "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
2974 self.assertRaises(UnicodeEncodeError,
2975 unicode_encodedecimal, "123\u20ac", "strict")
2976 self.assertRaisesRegex(
2977 ValueError,
2978 "^'decimal' codec can't encode character",
2979 unicode_encodedecimal, "123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002980
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002981 @support.cpython_only
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03002982 @support.requires_legacy_unicode_capi
Victor Stinner42bf7752011-11-21 22:52:58 +01002983 def test_transform_decimal(self):
2984 from _testcapi import unicode_transformdecimaltoascii as transform_decimal
Zackery Spytz8aabfa82021-03-06 23:12:35 -07002985 with warnings_helper.check_warnings():
2986 warnings.simplefilter('ignore', DeprecationWarning)
2987 self.assertEqual(transform_decimal('123'),
2988 '123')
2989 self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2990 '3.14')
2991 self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2992 "\N{EM SPACE}3.14\N{EN SPACE}")
2993 self.assertEqual(transform_decimal('123\u20ac'),
2994 '123\u20ac')
Victor Stinner42bf7752011-11-21 22:52:58 +01002995
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02002996 @support.cpython_only
2997 def test_pep393_utf8_caching_bug(self):
2998 # Issue #25709: Problem with string concatenation and utf-8 cache
2999 from _testcapi import getargs_s_hash
3000 for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
3001 s = ''
3002 for i in range(5):
3003 # Due to CPython specific optimization the 's' string can be
3004 # resized in-place.
3005 s += chr(k)
3006 # Parsing with the "s#" format code calls indirectly
3007 # PyUnicode_AsUTF8AndSize() which creates the UTF-8
3008 # encoded string cached in the Unicode object.
3009 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
3010 # Check that the second call returns the same result
3011 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
3012
Eric Smitha1eac722011-01-29 11:15:35 +00003013class StringModuleTest(unittest.TestCase):
3014 def test_formatter_parser(self):
3015 def parse(format):
3016 return list(_string.formatter_parser(format))
3017
3018 formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
3019 self.assertEqual(formatter, [
3020 ('prefix ', '2', '', 's'),
3021 ('xxx', '0', '^+10.3f', None),
3022 ('', 'obj.attr', '', 's'),
3023 (' ', 'z[0]', '10', 's'),
3024 ])
3025
3026 formatter = parse("prefix {} suffix")
3027 self.assertEqual(formatter, [
3028 ('prefix ', '', '', None),
3029 (' suffix', None, None, None),
3030 ])
3031
3032 formatter = parse("str")
3033 self.assertEqual(formatter, [
3034 ('str', None, None, None),
3035 ])
3036
3037 formatter = parse("")
3038 self.assertEqual(formatter, [])
3039
3040 formatter = parse("{0}")
3041 self.assertEqual(formatter, [
3042 ('', '0', '', None),
3043 ])
3044
3045 self.assertRaises(TypeError, _string.formatter_parser, 1)
3046
3047 def test_formatter_field_name_split(self):
3048 def split(name):
3049 items = list(_string.formatter_field_name_split(name))
3050 items[1] = list(items[1])
3051 return items
3052 self.assertEqual(split("obj"), ["obj", []])
3053 self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
3054 self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
3055 self.assertEqual(split("obj.arg[key1][key2]"), [
3056 "obj",
3057 [(True, 'arg'),
3058 (False, 'key1'),
3059 (False, 'key2'),
3060 ]])
3061 self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
3062
3063
Walter Dörwald28256f22003-01-19 16:59:20 +00003064if __name__ == "__main__":
Ezio Melotti0dceb562013-01-10 07:43:26 +02003065 unittest.main()