blob: d485bc7ede2b923182f89672148f4b98d74f3aee [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
Victor Stinner22eb6892019-06-26 00:51:05 +020014import textwrap
Greg Price6bccbe72019-08-14 04:05:19 -070015import unicodedata
Guido van Rossum98297ee2007-11-06 21:34:58 +000016import unittest
17import warnings
Hai Shideb01622020-07-06 20:29:49 +080018from test.support import import_helper
19from test.support import warnings_helper
Benjamin Petersonee8712c2008-05-20 21:35:26 +000020from test import support, string_tests
Victor Stinner22eb6892019-06-26 00:51:05 +020021from test.support.script_helper import assert_python_failure
Guido van Rossuma831cac2000-03-10 23:23:21 +000022
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function serving two deliberately broken test codecs.

    ``"test.unicode1"`` maps to an encoder/decoder pair that returns ``42``
    (not a tuple); ``"test.unicode2"`` maps to a pair that returns
    ``(42, 42)`` (a tuple whose first item is not text).  Every other
    encoding name yields ``None``.

    Returns a 4-tuple ``(encode, decode, None, None)`` or ``None``.
    """
    # The ``input``/``errors`` parameter names follow the codec calling
    # convention, so they are kept even though ``input`` shadows a builtin.
    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    broken_codecs = {
        "test.unicode1": (encode1, decode1, None, None),
        "test.unicode2": (encode2, decode2, None, None),
    }
    return broken_codecs.get(encoding)


# Module-level side effect: the broken codecs become visible to codecs.lookup().
codecs.register(search_function)
40
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The round trip goes through UTF-8 with the 'surrogatepass' error
    handler so that strings containing lone surrogates can be cloned too
    instead of raising UnicodeEncodeError.
    """
    return text.encode('utf-8', 'surrogatepass').decode('utf-8', 'surrogatepass')
50
class StrSubclass(str):
    """Trivial ``str`` subclass adding no behavior of its own."""
53
Brett Cannon226b2302010-03-20 22:22:22 +000054class UnicodeTest(string_tests.CommonTest,
55 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020056 string_tests.MixinStrUnicodeTest,
57 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000058
Guido van Rossumef87d6e2007-05-02 19:09:54 +000059 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000060
61 def checkequalnofix(self, result, object, methodname, *args):
62 method = getattr(object, methodname)
63 realresult = method(*args)
64 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000065 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000066
67 # if the original is returned make sure that
68 # this doesn't happen with subclasses
69 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000070 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000071 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000072 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000073 object = usub(object)
74 method = getattr(object, methodname)
75 realresult = method(*args)
76 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000077 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000078
Jeremy Hylton504de6b2003-10-06 05:08:26 +000079 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000080 self.assertEqual('\xff', '\u00ff')
81 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000082 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
83 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
84 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000085 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000086 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000087
Georg Brandl559e5d72008-06-11 18:37:52 +000088 def test_ascii(self):
89 if not sys.platform.startswith('java'):
90 # Test basic sanity of repr()
91 self.assertEqual(ascii('abc'), "'abc'")
92 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
93 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
94 self.assertEqual(ascii('\\c'), "'\\\\c'")
95 self.assertEqual(ascii('\\'), "'\\\\'")
96 self.assertEqual(ascii('\n'), "'\\n'")
97 self.assertEqual(ascii('\r'), "'\\r'")
98 self.assertEqual(ascii('\t'), "'\\t'")
99 self.assertEqual(ascii('\b'), "'\\x08'")
100 self.assertEqual(ascii("'\""), """'\\'"'""")
101 self.assertEqual(ascii("'\""), """'\\'"'""")
102 self.assertEqual(ascii("'"), '''"'"''')
103 self.assertEqual(ascii('"'), """'"'""")
104 latin1repr = (
105 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
106 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
107 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
108 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
109 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
110 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
111 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
112 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
113 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
114 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
115 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
116 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
117 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
118 "\\xfe\\xff'")
119 testrepr = ascii(''.join(map(chr, range(256))))
120 self.assertEqual(testrepr, latin1repr)
121 # Test ascii works on wide unicode escapes without overflow.
122 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
123 ascii("\U00010000" * 39 + "\uffff" * 4096))
124
125 class WrongRepr:
126 def __repr__(self):
127 return b'byte-repr'
128 self.assertRaises(TypeError, ascii, WrongRepr())
129
Walter Dörwald28256f22003-01-19 16:59:20 +0000130 def test_repr(self):
131 if not sys.platform.startswith('java'):
132 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000133 self.assertEqual(repr('abc'), "'abc'")
134 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
135 self.assertEqual(repr('ab\\'), "'ab\\\\'")
136 self.assertEqual(repr('\\c'), "'\\\\c'")
137 self.assertEqual(repr('\\'), "'\\\\'")
138 self.assertEqual(repr('\n'), "'\\n'")
139 self.assertEqual(repr('\r'), "'\\r'")
140 self.assertEqual(repr('\t'), "'\\t'")
141 self.assertEqual(repr('\b'), "'\\x08'")
142 self.assertEqual(repr("'\""), """'\\'"'""")
143 self.assertEqual(repr("'\""), """'\\'"'""")
144 self.assertEqual(repr("'"), '''"'"''')
145 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000146 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000147 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000148 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
149 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
150 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
151 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
152 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000153 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
154 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
155 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
156 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
157 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
158 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
159 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
160 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000161 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000162 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000163 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000164 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
165 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000166
Georg Brandl559e5d72008-06-11 18:37:52 +0000167 class WrongRepr:
168 def __repr__(self):
169 return b'byte-repr'
170 self.assertRaises(TypeError, repr, WrongRepr())
171
Guido van Rossum49d6b072006-08-17 21:11:47 +0000172 def test_iterators(self):
173 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000174 it = "\u1111\u2222\u3333".__iter__()
175 self.assertEqual(next(it), "\u1111")
176 self.assertEqual(next(it), "\u2222")
177 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000178 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000179
Walter Dörwald28256f22003-01-19 16:59:20 +0000180 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000181 string_tests.CommonTest.test_count(self)
182 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000183 self.checkequalnofix(3, 'aaa', 'count', 'a')
184 self.checkequalnofix(0, 'aaa', 'count', 'b')
185 self.checkequalnofix(3, 'aaa', 'count', 'a')
186 self.checkequalnofix(0, 'aaa', 'count', 'b')
187 self.checkequalnofix(0, 'aaa', 'count', 'b')
188 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
189 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
190 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
191 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200192 # test mixed kinds
193 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
194 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
195 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
196 self.checkequal(0, 'a' * 10, 'count', '\u0102')
197 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
198 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
199 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
200 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
201 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
202 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
203 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
204 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000205
Walter Dörwald28256f22003-01-19 16:59:20 +0000206 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200207 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200208 # test implementation details of the memchr fast path
209 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
210 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
211 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
212 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
213 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
214 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
215 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
216 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000217 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
218 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
219 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000220
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000221 self.assertRaises(TypeError, 'hello'.find)
222 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200223 # test mixed kinds
224 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
225 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
226 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
227 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
228 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
229 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
230 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
231 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
232 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
233 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
234 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
235 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000236
Walter Dörwald28256f22003-01-19 16:59:20 +0000237 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000238 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200239 # test implementation details of the memrchr fast path
240 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
241 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
242 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
243 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
244 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
245 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
246 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000247 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000248 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
249 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
250 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200251 # test mixed kinds
252 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
253 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
254 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
255 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
256 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
257 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
258 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
259 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
260 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
261 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
262 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
263 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000264
Walter Dörwald28256f22003-01-19 16:59:20 +0000265 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000266 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000267 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
268 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
269 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
270 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
271 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
272 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
273 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
274 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200275 # test mixed kinds
276 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
277 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
278 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
279 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
280 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
281 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
282 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
283 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
284 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
285 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
286 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
287 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000288
Walter Dörwald28256f22003-01-19 16:59:20 +0000289 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000290 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000291 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
292 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
293 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
294 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000295
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000296 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
297 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
298 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
299 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
300 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200301 # test mixed kinds
302 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
303 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
304 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
305 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
306 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
307 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
308 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
309 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
310 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
311 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
312 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
313 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000314
Georg Brandlceee0772007-11-27 23:48:05 +0000315 def test_maketrans_translate(self):
316 # these work with plain translate()
317 self.checkequalnofix('bbbc', 'abababc', 'translate',
318 {ord('a'): None})
319 self.checkequalnofix('iiic', 'abababc', 'translate',
320 {ord('a'): None, ord('b'): ord('i')})
321 self.checkequalnofix('iiix', 'abababc', 'translate',
322 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
323 self.checkequalnofix('c', 'abababc', 'translate',
324 {ord('a'): None, ord('b'): ''})
325 self.checkequalnofix('xyyx', 'xzx', 'translate',
326 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200327
Georg Brandlceee0772007-11-27 23:48:05 +0000328 # this needs maketrans()
329 self.checkequalnofix('abababc', 'abababc', 'translate',
330 {'b': '<i>'})
331 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
332 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
333 # test alternative way of calling maketrans()
334 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
335 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
336
Victor Stinner5a29f252014-04-05 00:17:51 +0200337 # various tests switching from ASCII to latin1 or the opposite;
338 # same length, remove a letter, or replace with a longer string.
339 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
340 "[X]")
341 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
342 "[X]")
343 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
344 "[]")
345 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
346 "[XXX]")
347 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
348 "[\xe9]")
Victor Stinner33798672016-03-01 21:59:58 +0100349 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
350 "x123")
351 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
352 "x\xe9")
353
354 # test non-ASCII (don't take the fast-path)
Victor Stinner5a29f252014-04-05 00:17:51 +0200355 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
356 "[<\xe9>]")
357 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
358 "[a]")
359 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
360 "[]")
Victor Stinner33798672016-03-01 21:59:58 +0100361 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
362 "[123]")
363 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
364 "[<\u20ac>\xe9]")
Victor Stinner5a29f252014-04-05 00:17:51 +0200365
Victor Stinner4ff33af2014-04-05 11:56:37 +0200366 # invalid Unicode characters
367 invalid_char = 0x10ffff+1
368 for before in "a\xe9\u20ac\U0010ffff":
369 mapping = str.maketrans({before: invalid_char})
370 text = "[%s]" % before
371 self.assertRaises(ValueError, text.translate, mapping)
372
373 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000374 self.assertRaises(TypeError, self.type2test.maketrans)
375 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
376 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
377 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
378 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
379 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
380 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000381
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000382 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000383 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000384
Walter Dörwald28256f22003-01-19 16:59:20 +0000385 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000386 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000387
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200388 # test mixed kinds
389 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
390 left *= 9
391 right *= 9
392 for delim in ('c', '\u0102', '\U00010302'):
393 self.checkequal([left + right],
394 left + right, 'split', delim)
395 self.checkequal([left, right],
396 left + delim + right, 'split', delim)
397 self.checkequal([left + right],
398 left + right, 'split', delim * 2)
399 self.checkequal([left, right],
400 left + delim * 2 + right, 'split', delim *2)
401
402 def test_rsplit(self):
403 string_tests.CommonTest.test_rsplit(self)
404 # test mixed kinds
405 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
406 left *= 9
407 right *= 9
408 for delim in ('c', '\u0102', '\U00010302'):
409 self.checkequal([left + right],
410 left + right, 'rsplit', delim)
411 self.checkequal([left, right],
412 left + delim + right, 'rsplit', delim)
413 self.checkequal([left + right],
414 left + right, 'rsplit', delim * 2)
415 self.checkequal([left, right],
416 left + delim * 2 + right, 'rsplit', delim *2)
417
418 def test_partition(self):
419 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
420 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300421 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200422 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
423 left *= 9
424 right *= 9
425 for delim in ('c', '\u0102', '\U00010302'):
426 self.checkequal((left + right, '', ''),
427 left + right, 'partition', delim)
428 self.checkequal((left, delim, right),
429 left + delim + right, 'partition', delim)
430 self.checkequal((left + right, '', ''),
431 left + right, 'partition', delim * 2)
432 self.checkequal((left, delim * 2, right),
433 left + delim * 2 + right, 'partition', delim * 2)
434
435 def test_rpartition(self):
436 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
437 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300438 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200439 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
440 left *= 9
441 right *= 9
442 for delim in ('c', '\u0102', '\U00010302'):
443 self.checkequal(('', '', left + right),
444 left + right, 'rpartition', delim)
445 self.checkequal((left, delim, right),
446 left + delim + right, 'rpartition', delim)
447 self.checkequal(('', '', left + right),
448 left + right, 'rpartition', delim * 2)
449 self.checkequal((left, delim * 2, right),
450 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000451
Walter Dörwald28256f22003-01-19 16:59:20 +0000452 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000453 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000454
Guido van Rossumf1044292007-09-27 18:01:22 +0000455 class MyWrapper:
456 def __init__(self, sval): self.sval = sval
457 def __str__(self): return self.sval
458
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000459 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000460 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
461 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
462 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
463 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
464 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
465 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
466 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000467 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
468 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
469 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
470 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000471
Martin Panterb71c0952017-01-12 11:54:59 +0000472 @unittest.skipIf(sys.maxsize > 2**32,
473 'needs too much memory on a 64-bit platform')
474 def test_join_overflow(self):
475 size = int(sys.maxsize**0.5) + 1
476 seq = ('A' * size,) * size
477 self.assertRaises(OverflowError, ''.join, seq)
478
Walter Dörwald28256f22003-01-19 16:59:20 +0000479 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000480 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000481
Walter Dörwald28256f22003-01-19 16:59:20 +0000482 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000483 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
484 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200485 # test mixed kinds
486 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
487 left *= 9
488 right *= 9
489 for delim in ('c', '\u0102', '\U00010302'):
490 for repl in ('d', '\u0103', '\U00010303'):
491 self.checkequal(left + right,
492 left + right, 'replace', delim, repl)
493 self.checkequal(left + repl + right,
494 left + delim + right,
495 'replace', delim, repl)
496 self.checkequal(left + right,
497 left + right, 'replace', delim * 2, repl)
498 self.checkequal(left + repl + right,
499 left + delim * 2 + right,
500 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000501
Victor Stinner59de0ee2011-10-07 10:01:28 +0200502 @support.cpython_only
503 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200504 pattern = 'abc'
505 text = 'abc def'
506 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200507
Guido van Rossum98297ee2007-11-06 21:34:58 +0000508 def test_bytes_comparison(self):
Hai Shideb01622020-07-06 20:29:49 +0800509 with warnings_helper.check_warnings():
Brett Cannon226b2302010-03-20 22:22:22 +0000510 warnings.simplefilter('ignore', BytesWarning)
511 self.assertEqual('abc' == b'abc', False)
512 self.assertEqual('abc' != b'abc', True)
513 self.assertEqual('abc' == bytearray(b'abc'), False)
514 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000515
Walter Dörwald28256f22003-01-19 16:59:20 +0000516 def test_comparison(self):
517 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000518 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000519 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000520 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000521
522 if 0:
523 # Move these tests to a Unicode collation module test...
524 # Testing UTF-16 code point order comparisons...
525
526 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000527 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000528 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000529 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000530
531 # Non surrogate above surrogate value, fixup required
532 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000533 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000534
535 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000536 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000538 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000539 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000540 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000542 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000543 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000544 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000545 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000546 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000547 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000548 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000549 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000550 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000551 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000552 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000553 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000554 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000555 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000556 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000557 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000558 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000559 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000560 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000561 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000562 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000563 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000564 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000565 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000566 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000567 test_lecmp(s, s2)
568
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000569 test_fixup('\ue000')
570 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000571
572 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000573 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000574
def test_islower(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_islower()
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # Roman numerals: the capital form is not lowercase, the small form is.
    self.assertFalse('\u2167'.islower())
    self.assertTrue('\u2177'.islower())
    # non-BMP, uppercase
    for upper_char in ('\U00010401', '\U00010427'):
        self.assertFalse(upper_char.islower())
    # non-BMP, lowercase
    for lower_char in ('\U00010429', '\U0001044E'):
        self.assertTrue(lower_char.islower())
    # non-BMP, non-cased
    for uncased_char in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased_char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000589
def test_isupper(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_isupper()
    on_java = sys.platform.startswith('java')
    if not on_java:
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # Roman numerals: the capital form is uppercase, the small form is not.
    self.assertTrue('\u2167'.isupper())
    self.assertFalse('\u2177'.isupper())
    # non-BMP, uppercase
    for upper_char in ('\U00010401', '\U00010427'):
        self.assertTrue(upper_char.isupper())
    # non-BMP, lowercase
    for lower_char in ('\U00010429', '\U0001044E'):
        self.assertFalse(lower_char.isupper())
    # non-BMP, non-cased
    for uncased_char in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased_char.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000605
def test_istitle(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_istitle()
    for titled in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, titled, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_titled = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for ch in not_titled:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
617
def test_isspace(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_isspace()
    bmp_cases = (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    )
    for expected, ch in bmp_cases:
        self.checkequalnofix(expected, ch, 'isspace')
    # There are no non-BMP whitespace chars as of Unicode 12.
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for ch in non_bmp:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
627
@support.requires_resource('cpu')
def test_isspace_invariant(self):
    # For every code point, str.isspace() must agree with the Unicode
    # database: bidirectional class WS, B or S, or general category Zs.
    space_bidi_classes = ('WS', 'B', 'S')
    for char in map(chr, range(sys.maxunicode + 1)):
        bidi_class = unicodedata.bidirectional(char)
        general_category = unicodedata.category(char)
        expected = (bidi_class in space_bidi_classes
                    or general_category == 'Zs')
        self.assertEqual(char.isspace(), expected)
637
def test_isalnum(self):
    # Run the shared checks first, then a set of non-BMP characters.
    super().test_isalnum()
    non_bmp_alnum = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107',
    )
    for ch in non_bmp_alnum:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000643
def test_isalpha(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_isalpha()
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    cased = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E')
    for ch in cased:
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    uncased = ('\U0001F40D', '\U0001F46F')
    for ch in uncased:
        self.assertFalse(ch.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000655
def test_isascii(self):
    # Run the shared checks first, then a BMP and a non-BMP character
    # that lie outside the ASCII range.
    super().test_isascii()
    for non_ascii in ("\u20ac", "\U0010ffff"):
        self.assertFalse(non_ascii.isascii())
660
def test_isdecimal(self):
    # (expected result, input) pairs, checked in this order.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() accepts no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    # Non-BMP characters.
    not_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    decimal = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
678
def test_isdigit(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_isdigit()
    bmp_cases = (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    )
    for expected, ch in bmp_cases:
        self.checkequalnofix(expected, ch, 'isdigit')

    # Non-BMP characters.
    not_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
690
def test_isnumeric(self):
    # (expected result, input) pairs, checked in this order.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() accepts no arguments.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    # Non-BMP characters.
    not_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
709
Martin v. Löwis47383402007-08-15 07:32:56 +0000710 def test_isidentifier(self):
711 self.assertTrue("a".isidentifier())
712 self.assertTrue("Z".isidentifier())
713 self.assertTrue("_".isidentifier())
714 self.assertTrue("b0".isidentifier())
715 self.assertTrue("bc".isidentifier())
716 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000717 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500718 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000719
720 self.assertFalse(" ".isidentifier())
721 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000722 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000723 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000724
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_isidentifier_legacy(self):
    # isidentifier() must give the same answer for a string obtained from
    # _testcapi.unicode_legacy_string() as for the ordinary str.
    import _testcapi
    text = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
    self.assertTrue(text.isidentifier())
    with warnings_helper.check_warnings():
        # The filter must be installed before the legacy string is built.
        warnings.simplefilter('ignore', DeprecationWarning)
        legacy_text = _testcapi.unicode_legacy_string(text)
        self.assertTrue(legacy_text.isidentifier())
Serhiy Storchaka5650e762020-05-12 16:18:00 +0300734
Georg Brandl559e5d72008-06-11 18:37:52 +0000735 def test_isprintable(self):
736 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000737 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000738 self.assertTrue("abcdefg".isprintable())
739 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000740 # some defined Unicode character
741 self.assertTrue("\u0374".isprintable())
742 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000743 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000744 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000745 self.assertFalse("\ud800".isprintable())
746
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300747 self.assertTrue('\U0001F46F'.isprintable())
748 self.assertFalse('\U000E0020'.isprintable())
749
750 def test_surrogates(self):
751 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
752 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
753 self.assertTrue(s.islower())
754 self.assertFalse(s.isupper())
755 self.assertFalse(s.istitle())
756 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
757 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
758 self.assertFalse(s.islower())
759 self.assertTrue(s.isupper())
760 self.assertTrue(s.istitle())
761
762 for meth_name in ('islower', 'isupper', 'istitle'):
763 meth = getattr(str, meth_name)
764 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
765 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
766
767 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
768 'isdecimal', 'isnumeric',
769 'isidentifier', 'isprintable'):
770 meth = getattr(str, meth_name)
771 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
772 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
773 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
774 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
775
776
def test_lower(self):
    # Run the shared checks first, then the str-specific ones below.
    string_tests.CommonTest.test_lower(self)
    # (input, expected lower()) pairs.  Non-ASCII characters are written
    # as escapes so they survive any re-encoding of this file.
    cases = (
        # non-BMP cased letters
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
        # LATIN SMALL LIGATURE FI is already lowercase and must be kept
        # as a single character.
        ('\ufb01', '\ufb01'),
        # One character lowercasing to two.
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3: it lowers
        # to final sigma (U+03C2) only at the end of a word.
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0345\u03c3'),
        ('A\u0345\u03a3', 'a\u0345\u03c2'),
        ('A\u0345\u03a3a', 'a\u0345\u03c3a'),
        ('A\u03a3\u0345', 'a\u03c2\u0345'),
        ('\u03a3\u0345 ', '\u03c3\u0345 '),
        # Characters that lower() leaves unchanged.
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2177'),
    )
    for text, expected in cases:
        self.assertEqual(text.lower(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300798
Benjamin Petersond5890c82012-01-14 13:23:30 -0500799 def test_casefold(self):
800 self.assertEqual('hello'.casefold(), 'hello')
801 self.assertEqual('hELlo'.casefold(), 'hello')
802 self.assertEqual('ß'.casefold(), 'ss')
803 self.assertEqual('fi'.casefold(), 'fi')
804 self.assertEqual('\u03a3'.casefold(), '\u03c3')
805 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700806 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500807
def test_upper(self):
    # Run the shared checks first, then the str-specific ones below.
    string_tests.CommonTest.test_upper(self)
    # (input, expected upper()) pairs.  Non-ASCII characters are written
    # as escapes so they survive any re-encoding of this file.
    cases = (
        # non-BMP cased letters
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
        # LATIN SMALL LIGATURE FI expands to two capital letters.
        ('\ufb01', 'FI'),
        ('\u0130', '\u0130'),
        ('\u03a3', '\u03a3'),
        # LATIN SMALL LETTER SHARP S expands to two capital letters.
        ('\xdf', 'SS'),
        # One character uppercasing to three.
        ('\u1fd2', '\u0399\u0308\u0300'),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2167'),
    )
    for text, expected in cases:
        self.assertEqual(text.upper(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300824
def test_capitalize(self):
    # Run the shared checks first, then the str-specific ones below.
    string_tests.CommonTest.test_capitalize(self)
    # (input, expected capitalize()) pairs.  Non-ASCII characters are
    # written as escapes so they survive any re-encoding of this file.
    cases = (
        # non-BMP cased letters
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
        # Characters whose case mapping is longer than one character.
        ('h\u0130', 'H\u0069\u0307'),
        ('\u1fd2\u0130', '\u0399\u0308\u0300\u0069\u0307'),
        # The first character is titlecased, not uppercased, so LATIN
        # SMALL LIGATURE FI becomes 'Fi' rather than 'FI'.
        ('\ufb01nnish', 'Finnish'),
        # A word-final capital sigma lowers to final sigma (U+03C2).
        ('A\u0345\u03a3', 'A\u0345\u03c2'),
    )
    for text, expected in cases:
        self.assertEqual(text.capitalize(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300841
def test_title(self):
    # Run the shared checks first, then the str-specific ones below.
    super().test_title()
    # (input, expected title()) pairs.  Non-ASCII characters are written
    # as escapes so they survive any re-encoding of this file.
    titled_pair = '\U00010427\U0001044F \U00010427\U0001044F'
    cases = (
        # non-BMP cased letters
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U0001044F \U0001044F\U0001044F', titled_pair),
        ('\U00010427\U0001044F \U00010427\U0001044F', titled_pair),
        ('\U0001044F\U00010427 \U0001044F\U00010427', titled_pair),
        ('X\U00010427x\U0001044F X\U00010427x\U0001044F',
         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
        # LATIN SMALL LIGATURE FI titlecases to the two letters 'Fi'.
        ('\ufb01NNISH', 'Finnish'),
        # Capital sigma lowers to final sigma (U+03C2) only at the end
        # of a word.
        ('A\u03a3 \u1fa1xy', 'A\u03c2 \u1fa9xy'),
        ('A\u03a3A', 'A\u03c3a'),
    )
    for text, expected in cases:
        self.assertEqual(text.title(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300858
def test_swapcase(self):
    # Run the shared checks first, then the str-specific ones below.
    string_tests.CommonTest.test_swapcase(self)
    # (input, expected swapcase()) pairs.  Non-ASCII characters are
    # written as escapes so they survive any re-encoding of this file.
    cases = (
        # non-BMP cased letters
        ('\U0001044F', '\U00010427'),
        ('\U00010427', '\U0001044F'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U0001044F\U00010427'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044FX\U00010427'),
        # LATIN SMALL LIGATURE FI expands to two capital letters.
        ('\ufb01', 'FI'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3: it lowers
        # to final sigma (U+03C2) only at the end of a word.
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0399\u03c3'),
        ('A\u0345\u03a3', 'a\u0399\u03c2'),
        ('A\u0345\u03a3a', 'a\u0399\u03c3A'),
        ('A\u03a3\u0345', 'a\u03c2\u0399'),
        ('\u03a3\u0345 ', '\u03c3\u0399 '),
        # Lowercase characters whose uppercase form is longer.
        ('\xdf', 'SS'),  # LATIN SMALL LETTER SHARP S
        ('\u1fd2', '\u0399\u0308\u0300'),
    )
    for text, expected in cases:
        self.assertEqual(text.swapcase(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300884
def test_center(self):
    # Run the shared checks first, then centre 'x' with a non-BMP fill
    # character at several widths.
    string_tests.CommonTest.test_center(self)
    fill = '\U0010FFFF'
    cases = (
        (2, 'x\U0010FFFF'),
        (3, '\U0010FFFFx\U0010FFFF'),
        (4, '\U0010FFFFx\U0010FFFF\U0010FFFF'),
    )
    for width, expected in cases:
        self.assertEqual('x'.center(width, fill), expected)
893
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    # Issue #22643: upper() on a very long string must raise
    # OverflowError on a 32-bit build.
    # NOTE(review): the divisor 12 presumably reflects the worst-case
    # per-character size of the case-mapping buffer -- confirm against
    # the issue.
    size = 2**32//12 + 1
    try:
        s = "ü" * size
    except MemoryError:
        self.skipTest('not enough memory (%.0f MiB required)' % (size / 2**20))
    try:
        self.assertRaises(OverflowError, s.upper)
    finally:
        # Release the large string promptly, whatever the outcome.
        del s
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400907
Walter Dörwald28256f22003-01-19 16:59:20 +0000908 def test_contains(self):
909 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000910 self.assertIn('a', 'abdb')
911 self.assertIn('a', 'bdab')
912 self.assertIn('a', 'bdaba')
913 self.assertIn('a', 'bdba')
914 self.assertNotIn('a', 'bdb')
915 self.assertIn('a', 'bdba')
916 self.assertIn('a', ('a',1,None))
917 self.assertIn('a', (1,None,'a'))
918 self.assertIn('a', ('a',1,None))
919 self.assertIn('a', (1,None,'a'))
920 self.assertNotIn('a', ('x',1,'y'))
921 self.assertNotIn('a', ('x',1,None))
922 self.assertNotIn('abcd', 'abcxxxx')
923 self.assertIn('ab', 'abcd')
924 self.assertIn('ab', 'abc')
925 self.assertIn('ab', (1,None,'ab'))
926 self.assertIn('', 'abc')
927 self.assertIn('', '')
928 self.assertIn('', 'abc')
929 self.assertNotIn('\0', 'abc')
930 self.assertIn('\0', '\0abc')
931 self.assertIn('\0', 'abc\0')
932 self.assertIn('a', '\0abc')
933 self.assertIn('asdf', 'asdf')
934 self.assertNotIn('asdf', 'asd')
935 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000936
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000937 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200938 # test mixed kinds
939 for fill in ('a', '\u0100', '\U00010300'):
940 fill *= 9
941 for delim in ('c', '\u0102', '\U00010302'):
942 self.assertNotIn(delim, fill)
943 self.assertIn(delim, fill + delim)
944 self.assertNotIn(delim * 2, fill)
945 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000946
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300947 def test_issue18183(self):
948 '\U00010000\U00100000'.lower()
949 '\U00010000\U00100000'.casefold()
950 '\U00010000\U00100000'.upper()
951 '\U00010000\U00100000'.capitalize()
952 '\U00010000\U00100000'.title()
953 '\U00010000\U00100000'.swapcase()
954 '\U00100000'.center(3, '\U00010000')
955 '\U00100000'.ljust(3, '\U00010000')
956 '\U00100000'.rjust(3, '\U00010000')
957
Eric Smith8c663262007-08-25 02:26:07 +0000958 def test_format(self):
959 self.assertEqual(''.format(), '')
960 self.assertEqual('a'.format(), 'a')
961 self.assertEqual('ab'.format(), 'ab')
962 self.assertEqual('a{{'.format(), 'a{')
963 self.assertEqual('a}}'.format(), 'a}')
964 self.assertEqual('{{b'.format(), '{b')
965 self.assertEqual('}}b'.format(), '}b')
966 self.assertEqual('a{{b'.format(), 'a{b')
967
968 # examples from the PEP:
969 import datetime
970 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
971 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
972 "My name is Fred")
973 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
974 "My name is Fred :-{}")
975
976 d = datetime.date(2007, 8, 18)
977 self.assertEqual("The year is {0.year}".format(d),
978 "The year is 2007")
979
Eric Smith8c663262007-08-25 02:26:07 +0000980 # classes we'll use for testing
981 class C:
982 def __init__(self, x=100):
983 self._x = x
984 def __format__(self, spec):
985 return spec
986
987 class D:
988 def __init__(self, x):
989 self.x = x
990 def __format__(self, spec):
991 return str(self.x)
992
993 # class with __str__, but no __format__
994 class E:
995 def __init__(self, x):
996 self.x = x
997 def __str__(self):
998 return 'E(' + self.x + ')'
999
1000 # class with __repr__, but no __format__ or __str__
1001 class F:
1002 def __init__(self, x):
1003 self.x = x
1004 def __repr__(self):
1005 return 'F(' + self.x + ')'
1006
1007 # class with __format__ that forwards to string, for some format_spec's
1008 class G:
1009 def __init__(self, x):
1010 self.x = x
1011 def __str__(self):
1012 return "string is " + self.x
1013 def __format__(self, format_spec):
1014 if format_spec == 'd':
1015 return 'G(' + self.x + ')'
1016 return object.__format__(self, format_spec)
1017
Eric Smith739e2ad2007-08-27 19:07:22 +00001018 class I(datetime.date):
1019 def __format__(self, format_spec):
1020 return self.strftime(format_spec)
1021
Eric Smith185e30c2007-08-30 22:23:08 +00001022 class J(int):
1023 def __format__(self, format_spec):
1024 return int.__format__(self * 2, format_spec)
1025
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001026 class M:
1027 def __init__(self, x):
1028 self.x = x
1029 def __repr__(self):
1030 return 'M(' + self.x + ')'
1031 __str__ = None
1032
1033 class N:
1034 def __init__(self, x):
1035 self.x = x
1036 def __repr__(self):
1037 return 'N(' + self.x + ')'
1038 __format__ = None
Eric Smith8c663262007-08-25 02:26:07 +00001039
1040 self.assertEqual(''.format(), '')
1041 self.assertEqual('abc'.format(), 'abc')
1042 self.assertEqual('{0}'.format('abc'), 'abc')
1043 self.assertEqual('{0:}'.format('abc'), 'abc')
1044# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1045 self.assertEqual('X{0}'.format('abc'), 'Xabc')
1046 self.assertEqual('{0}X'.format('abc'), 'abcX')
1047 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1048 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1049 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1050 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1051 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1052 self.assertEqual('{0}'.format(-15), '-15')
1053 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1054 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1055 self.assertEqual('{{'.format(), '{')
1056 self.assertEqual('}}'.format(), '}')
1057 self.assertEqual('{{}}'.format(), '{}')
1058 self.assertEqual('{{x}}'.format(), '{x}')
1059 self.assertEqual('{{{0}}}'.format(123), '{123}')
1060 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1061 self.assertEqual('}}{{'.format(), '}{')
1062 self.assertEqual('}}x{{'.format(), '}x{')
1063
Eric Smith7ade6482007-08-26 22:27:13 +00001064 # weird field names
1065 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1066 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001067 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001068
Eric Smith8c663262007-08-25 02:26:07 +00001069 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1070 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1071 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1072 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1073 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1074 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1075 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1076
Eric Smith8c663262007-08-25 02:26:07 +00001077 # strings
1078 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1079 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1080 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1081 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1082 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1083 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1084 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1085 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1086 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1087 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1088 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1089 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1090 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1091 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1092 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1093 self.assertEqual('{0:>7s}'.format('result'), ' result')
1094 self.assertEqual('{0:>8s}'.format('result'), ' result')
1095 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1096 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1097 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1098 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1099 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1100 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1101
Eric V. Smith2ea97122014-04-14 11:55:10 -04001102 # issue 12546: use \x00 as a fill character
1103 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1104 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1105 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1106 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1107
1108 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1109 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1110 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1111 self.assertEqual('{0:<6}'.format(3), '3 ')
1112
1113 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1114 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1115 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1116 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1117
1118 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1119 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1120 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1121 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1122
Eric Smith8c663262007-08-25 02:26:07 +00001123 # format specifiers for user defined type
1124 self.assertEqual('{0:abc}'.format(C()), 'abc')
1125
Georg Brandld52429f2008-07-04 15:55:02 +00001126 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001127 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1128 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1129 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1130 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1131 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1132 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1133 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001134 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001135 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1136 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001137 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001138 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001139 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001140 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1141 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001142 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001143
Eric Smith8c663262007-08-25 02:26:07 +00001144 # test fallback to object.__format__
1145 self.assertEqual('{0}'.format({}), '{}')
1146 self.assertEqual('{0}'.format([]), '[]')
1147 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001148
Eric Smith8c663262007-08-25 02:26:07 +00001149 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001150 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1151
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001152 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1153 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1154 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001155
Eric Smith739e2ad2007-08-27 19:07:22 +00001156 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1157 month=8,
1158 day=27)),
1159 "date: 2007-08-27")
1160
Eric Smith185e30c2007-08-30 22:23:08 +00001161 # test deriving from a builtin type and overriding __format__
1162 self.assertEqual("{0}".format(J(10)), "20")
1163
1164
Eric Smith8c663262007-08-25 02:26:07 +00001165 # string format specifiers
1166 self.assertEqual('{0:}'.format('a'), 'a')
1167
1168 # computed format specifiers
1169 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1170 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1171 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1172 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1173 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1174
1175 # test various errors
1176 self.assertRaises(ValueError, '{'.format)
1177 self.assertRaises(ValueError, '}'.format)
1178 self.assertRaises(ValueError, 'a{'.format)
1179 self.assertRaises(ValueError, 'a}'.format)
1180 self.assertRaises(ValueError, '{a'.format)
1181 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001182 self.assertRaises(IndexError, '{0}'.format)
1183 self.assertRaises(IndexError, '{1}'.format, 'abc')
1184 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001185 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001186 self.assertRaises(ValueError, "abc{0:{}".format)
1187 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001188 self.assertRaises(IndexError, "{0.}".format)
1189 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001190 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001191 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001192 self.assertRaises(KeyError, "{0]}".format)
1193 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001194 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001195 self.assertRaises(ValueError, "{0[0}".format, 0)
1196 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1197 self.assertRaises(KeyError, "{c]}".format)
1198 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1199 self.assertRaises(ValueError, "{0}}".format, 0)
1200 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001201 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001202 self.assertRaises(ValueError, "{0!}".format, 0)
1203 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001204 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001205 self.assertRaises(IndexError, "{:}".format)
1206 self.assertRaises(IndexError, "{:s}".format)
1207 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001208 big = "23098475029384702983476098230754973209482573"
1209 self.assertRaises(ValueError, ("{" + big + "}").format)
1210 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001211
Eric Smith41669ca2009-05-23 14:23:22 +00001212 # issue 6089
1213 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1214 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1215
Eric Smith8c663262007-08-25 02:26:07 +00001216 # can't have a replacement on the field name portion
1217 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1218
1219 # exceed maximum recursion depth
1220 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1221 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1222 0, 1, 2, 3, 4, 5, 6, 7)
1223
1224 # string format spec errors
1225 self.assertRaises(ValueError, "{0:-s}".format, '')
1226 self.assertRaises(ValueError, format, "", "-")
1227 self.assertRaises(ValueError, "{0:=s}".format, '')
1228
Eric Smithb1ebcc62008-07-15 13:02:41 +00001229 # Alternate formatting is not supported
1230 self.assertRaises(ValueError, format, '', '#')
1231 self.assertRaises(ValueError, format, '', '#20')
1232
Victor Stinnerece58de2012-04-23 23:36:38 +02001233 # Non-ASCII
1234 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1235 'ABC\u0410\u0411\u0412')
1236 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1237 'ABC')
1238 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1239 '')
1240
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001241 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001242 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1243 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1244 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1245 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1246 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1247 self.assertRaises(ValueError, "{a{}b}".format, 42)
1248 self.assertRaises(ValueError, "{a{b}".format, 42)
1249 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001250
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001251 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001252
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001253 # Blocking fallback
1254 m = M('data')
1255 self.assertEqual("{!r}".format(m), 'M(data)')
1256 self.assertRaises(TypeError, "{!s}".format, m)
1257 self.assertRaises(TypeError, "{}".format, m)
1258 n = N('data')
1259 self.assertEqual("{!r}".format(n), 'N(data)')
1260 self.assertEqual("{!s}".format(n), 'N(data)')
1261 self.assertRaises(TypeError, "{}".format, n)
1262
Eric Smith27bbca62010-11-04 17:06:58 +00001263 def test_format_map(self):
1264 self.assertEqual(''.format_map({}), '')
1265 self.assertEqual('a'.format_map({}), 'a')
1266 self.assertEqual('ab'.format_map({}), 'ab')
1267 self.assertEqual('a{{'.format_map({}), 'a{')
1268 self.assertEqual('a}}'.format_map({}), 'a}')
1269 self.assertEqual('{{b'.format_map({}), '{b')
1270 self.assertEqual('}}b'.format_map({}), '}b')
1271 self.assertEqual('a{{b'.format_map({}), 'a{b')
1272
1273 # using mappings
1274 class Mapping(dict):
1275 def __missing__(self, key):
1276 return key
1277 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1278 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1279
1280 class InternalMapping:
1281 def __init__(self):
1282 self.mapping = {'a': 'hello'}
1283 def __getitem__(self, key):
1284 return self.mapping[key]
1285 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1286
1287
Eric Smith27bbca62010-11-04 17:06:58 +00001288 class C:
1289 def __init__(self, x=100):
1290 self._x = x
1291 def __format__(self, spec):
1292 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001293 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1294
1295 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001296 self.assertRaises(TypeError, ''.format_map)
1297 self.assertRaises(TypeError, 'a'.format_map)
1298
1299 self.assertRaises(ValueError, '{'.format_map, {})
1300 self.assertRaises(ValueError, '}'.format_map, {})
1301 self.assertRaises(ValueError, 'a{'.format_map, {})
1302 self.assertRaises(ValueError, 'a}'.format_map, {})
1303 self.assertRaises(ValueError, '{a'.format_map, {})
1304 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001305
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001306 # issue #12579: can't supply positional params to format_map
1307 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1308 self.assertRaises(ValueError, '{}'.format_map, 'a')
1309 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1310
Serhiy Storchaka50754162017-08-03 11:45:23 +03001311 class BadMapping:
1312 def __getitem__(self, key):
1313 return 1/0
1314 self.assertRaises(KeyError, '{a}'.format_map, {})
1315 self.assertRaises(TypeError, '{a}'.format_map, [])
1316 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1317
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001318 def test_format_huge_precision(self):
1319 format_string = ".{}f".format(sys.maxsize + 1)
1320 with self.assertRaises(ValueError):
1321 result = format(2.34, format_string)
1322
1323 def test_format_huge_width(self):
1324 format_string = "{}f".format(sys.maxsize + 1)
1325 with self.assertRaises(ValueError):
1326 result = format(2.34, format_string)
1327
1328 def test_format_huge_item_number(self):
1329 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1330 with self.assertRaises(ValueError):
1331 result = format_string.format(2.34)
1332
Eric Smith8ec90442009-03-14 12:29:34 +00001333 def test_format_auto_numbering(self):
1334 class C:
1335 def __init__(self, x=100):
1336 self._x = x
1337 def __format__(self, spec):
1338 return spec
1339
1340 self.assertEqual('{}'.format(10), '10')
1341 self.assertEqual('{:5}'.format('s'), 's ')
1342 self.assertEqual('{!r}'.format('s'), "'s'")
1343 self.assertEqual('{._x}'.format(C(10)), '10')
1344 self.assertEqual('{[1]}'.format([1, 2]), '2')
1345 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1346 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1347
1348 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1349 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1350
1351 # can't mix and match numbering and auto-numbering
1352 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1353 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1354 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1355 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1356
1357 # can mix and match auto-numbering and named
1358 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1359 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1360 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1361 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1362
def test_formatting(self):
    # Run the shared %-formatting checks first, then the str-specific ones.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # NOTE: '%5.2f' right-justifies in five columns, so each four-character
    # result below is preceded by TWO spaces (separator plus one pad).
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3),
                     'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3),
                     'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5),
                     'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57),
                     'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57),
                     'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"),
                     '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"),
                     '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; results are padded
    # to five columns (a negative width left-justifies).
    self.assertEqual('%*s' % (5,'abc',), ' ' * 2 + 'abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc' + ' ' * 2)
    self.assertEqual('%*.*s' % (5,2,'abc',), ' ' * 3 + 'ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), ' ' * 2 + 'abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10' + ' ' * 3 + 'abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',),
                     '103' + ' ' * 3 + 'abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    #issue 19995
    class PseudoInt:
        # Integer-like: provides both __int__ and __index__.
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # Provides __int__ only, so integer formats must reject it.
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001449
Ethan Furmanfb137212013-08-31 10:18:55 -07001450 def test_formatting_with_enum(self):
1451 # issue18780
1452 import enum
1453 class Float(float, enum.Enum):
1454 PI = 3.1415926
1455 class Int(enum.IntEnum):
1456 IDES = 15
1457 class Str(str, enum.Enum):
1458 ABC = 'abc'
1459 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001460 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1461 'Str.ABC, Str.ABC')
1462 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1463 (Str.ABC, Str.ABC,
1464 Int.IDES, Int.IDES, Int.IDES,
1465 Float.PI, Float.PI),
1466 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001467
1468 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001469 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1470 '...Str.ABC...')
1471 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1472 '...Int.IDES...')
1473 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1474 '...15...')
1475 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1476 '...15...')
1477 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1478 '...15...')
1479 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1480 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001481
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001482 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001483 format_string = "%.{}f".format(sys.maxsize + 1)
1484 with self.assertRaises(ValueError):
1485 result = format_string % 2.34
1486
Martijn Pietersd7e64332017-02-23 13:38:04 +00001487 def test_issue28598_strsubclass_rhs(self):
1488 # A subclass of str with an __rmod__ method should be able to hook
1489 # into the % operator
1490 class SubclassedStr(str):
1491 def __rmod__(self, other):
1492 return 'Success, self.__rmod__({!r}) was called'.format(other)
1493 self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1494 "Success, self.__rmod__('lhs %% %r') was called")
1495
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # %-formatting must reject a precision one past the C INT_MAX reported
    # by _testcapi.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        format_string % 2.34
1502
1503 def test_formatting_huge_width(self):
1504 format_string = "%{}f".format(sys.maxsize + 1)
1505 with self.assertRaises(ValueError):
1506 result = format_string % 2.34
1507
Ezio Melottiba42fd52011-04-26 06:09:45 +03001508 def test_startswith_endswith_errors(self):
1509 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001510 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001511 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001512 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001513 self.assertIn('str', exc)
1514 self.assertIn('tuple', exc)
1515
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001520
def test_constructor(self):
    # str(obj) tests (this maps to PyObject_Str() at C level)

    self.assertEqual(
        str('unicode remains unicode'),
        'unicode remains unicode'
    )

    # str subclasses convert back to a plain str with equal value/length.
    for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
        subclass = StrSubclass(text)
        self.assertEqual(str(subclass), text)
        self.assertEqual(len(subclass), len(text))
        if text == 'ascii':
            self.assertEqual(subclass.encode('ascii'), b'ascii')
            self.assertEqual(subclass.encode('utf-8'), b'ascii')

    self.assertEqual(
        str('strings are converted to unicode'),
        'strings are converted to unicode'
    )

    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        str(StringCompat('__str__ compatible objects are recognized')),
        '__str__ compatible objects are recognized'
    )

    # str(obj) goes through the object's __str__():

    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # str() of an already-converted value is a no-op.
    for obj in (123, 123.45):
        self.assertEqual(str(obj), str(str(obj)))

    # str(obj, encoding, errors) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    if not sys.platform.startswith('java'):
        self.assertRaises(
            TypeError,
            str,
            'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        str(b'strings are decoded to unicode', 'utf-8', 'strict'),
        'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            str(
                memoryview(b'character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            'character buffers are decoded to unicode'
        )

    self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001590
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001591 def test_constructor_keyword_args(self):
1592 """Pass various keyword argument combinations to the constructor."""
1593 # The object argument can be passed as a keyword.
1594 self.assertEqual(str(object='foo'), 'foo')
1595 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1596 # The errors argument without encoding triggers "decode" mode.
1597 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1598 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1599
1600 def test_constructor_defaults(self):
1601 """Check the constructor argument defaults."""
1602 # The object argument defaults to '' or b''.
1603 self.assertEqual(str(), '')
1604 self.assertEqual(str(errors='strict'), '')
1605 utf8_cent = '¢'.encode('utf-8')
1606 # The encoding argument defaults to utf-8.
1607 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1608 # The errors argument defaults to strict.
1609 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1610
Walter Dörwald28256f22003-01-19 16:59:20 +00001611 def test_codecs_utf7(self):
1612 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001613 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1614 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1615 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1616 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1617 ('+', b'+-'),
1618 ('+-', b'+--'),
1619 ('+?', b'+-?'),
R David Murray44b548d2016-09-08 13:59:53 -04001620 (r'\?', b'+AFw?'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001621 ('+?', b'+-?'),
1622 (r'\\?', b'+AFwAXA?'),
1623 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001624 (r'++--', b'+-+---'),
1625 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1626 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001627 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001628
Walter Dörwald28256f22003-01-19 16:59:20 +00001629 for (x, y) in utfTests:
1630 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001631
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001632 # Unpaired surrogates are passed through
1633 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1634 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1635 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1636 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1637 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1638 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1639 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1640 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001641
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001642 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1643 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001644
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001645 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001646 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001647
1648 # Direct encoded characters
1649 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1650 # Optional direct characters
1651 set_o = '!"#$%&*;<=>@[]^_`{|}'
1652 for c in set_d:
1653 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1654 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1655 for c in set_o:
1656 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001657
Zackery Spytze349bf22018-08-18 22:43:38 -06001658 with self.assertRaisesRegex(UnicodeDecodeError,
1659 'ill-formed sequence'):
1660 b'+@'.decode('utf-7')
1661
Walter Dörwald28256f22003-01-19 16:59:20 +00001662 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001663 self.assertEqual(''.encode('utf-8'), b'')
1664 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001665 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1666 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001667 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1668 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001669 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1670 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001671 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001672 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1673 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1674 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1675 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1676 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1677 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001678 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1679 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1680 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1681 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1682 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1683 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1684 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1685 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1686 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1687 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001688 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001689
Walter Dörwald28256f22003-01-19 16:59:20 +00001690 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001691 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1692 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1693 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694
Walter Dörwald28256f22003-01-19 16:59:20 +00001695 # Other possible utf-8 test cases:
1696 # * strict decoding testing for all of the
1697 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001698
Ezio Melotti57221d02010-07-01 07:32:02 +00001699 def test_utf8_decode_valid_sequences(self):
1700 sequences = [
1701 # single byte
1702 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1703 # 2 bytes
1704 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1705 # 3 bytes
1706 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1707 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1708 # 4 bytes
1709 (b'\xF0\x90\x80\x80', '\U00010000'),
1710 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1711 ]
1712 for seq, res in sequences:
1713 self.assertEqual(seq.decode('utf-8'), res)
1714
1715
1716 def test_utf8_decode_invalid_sequences(self):
1717 # continuation bytes in a sequence of 2, 3, or 4 bytes
1718 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001719 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001720 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001721 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001722 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1723 invalid_start_bytes = (
1724 continuation_bytes + invalid_2B_seq_start_bytes +
1725 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1726 )
1727
1728 for byte in invalid_start_bytes:
1729 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1730
1731 for sb in invalid_2B_seq_start_bytes:
1732 for cb in continuation_bytes:
1733 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1734
1735 for sb in invalid_4B_seq_start_bytes:
1736 for cb1 in continuation_bytes[:3]:
1737 for cb3 in continuation_bytes[:3]:
1738 self.assertRaises(UnicodeDecodeError,
1739 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1740
1741 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1742 self.assertRaises(UnicodeDecodeError,
1743 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1744 self.assertRaises(UnicodeDecodeError,
1745 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1746 # surrogates
1747 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1748 self.assertRaises(UnicodeDecodeError,
1749 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1750 self.assertRaises(UnicodeDecodeError,
1751 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1752 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1753 self.assertRaises(UnicodeDecodeError,
1754 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1755 self.assertRaises(UnicodeDecodeError,
1756 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1757 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1758 self.assertRaises(UnicodeDecodeError,
1759 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1760 self.assertRaises(UnicodeDecodeError,
1761 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1762
1763 def test_issue8271(self):
1764 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1765 # only the start byte and the continuation byte(s) are now considered
1766 # invalid, instead of the number of bytes specified by the start byte.
Benjamin Peterson51796e52020-03-10 21:10:59 -07001767 # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
Ezio Melotti57221d02010-07-01 07:32:02 +00001768 # table 3-8, Row 2) for more information about the algorithm used.
1769 FFFD = '\ufffd'
1770 sequences = [
1771 # invalid start bytes
1772 (b'\x80', FFFD), # continuation byte
1773 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1774 (b'\xc0', FFFD),
1775 (b'\xc0\xc0', FFFD*2),
1776 (b'\xc1', FFFD),
1777 (b'\xc1\xc0', FFFD*2),
1778 (b'\xc0\xc1', FFFD*2),
1779 # with start byte of a 2-byte sequence
1780 (b'\xc2', FFFD), # only the start byte
1781 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001782 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001783 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1784 # with start byte of a 3-byte sequence
1785 (b'\xe1', FFFD), # only the start byte
1786 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1787 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1788 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1789 (b'\xe1\x80', FFFD), # only 1 continuation byte
1790 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1791 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1792 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1793 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1794 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1795 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1796 # with start byte of a 4-byte sequence
1797 (b'\xf1', FFFD), # only the start byte
1798 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1799 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1800 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1801 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1802 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1803 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1804 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1805 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1806 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1807 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1808 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1809 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1810 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1811 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1812 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1813 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1814 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1815 # with invalid start byte of a 4-byte sequence (rfc2279)
1816 (b'\xf5', FFFD), # only the start byte
1817 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1818 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1819 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1820 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1821 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1822 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1823 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1824 # with invalid start byte of a 5-byte sequence (rfc2279)
1825 (b'\xf8', FFFD), # only the start byte
1826 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1827 (b'\xf8\x80', FFFD*2), # only one continuation byte
1828 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1829 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1830 # with invalid start byte of a 6-byte sequence (rfc2279)
1831 (b'\xfc', FFFD), # only the start byte
1832 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1833 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1834 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1835 # invalid start byte
1836 (b'\xfe', FFFD),
1837 (b'\xfe\x80\x80', FFFD*3),
1838 # other sequences
1839 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1840 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1841 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1842 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1843 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1844 ]
1845 for n, (seq, res) in enumerate(sequences):
1846 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1847 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1848 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1849 self.assertEqual(seq.decode('utf-8', 'ignore'),
1850 res.replace('\uFFFD', ''))
1851
def assertCorrectUTF8Decoding(self, seq, res, err):
    """Assert how the invalid UTF-8 byte string *seq* is decoded.

    With the default (strict) error handler decoding must raise a
    UnicodeDecodeError whose message contains *err*.  With 'replace' it
    must produce *res*, and with 'ignore' it must produce *res* with
    every U+FFFD removed.  Both lenient handlers are also checked with
    *seq* embedded between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    embedded = b'aaaa' + seq + b'bbbb'

    self.assertEqual(seq.decode('utf-8', 'replace'), res)
    self.assertEqual(embedded.decode('utf-8', 'replace'),
                     'aaaa' + res + 'bbbb')

    without_markers = res.replace('\ufffd', '')
    self.assertEqual(seq.decode('utf-8', 'ignore'), without_markers)
    self.assertEqual(embedded.decode('utf-8', 'ignore'),
                     'aaaa' + without_markers + 'bbbb')
1870
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first byte
    is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
    4-bytes sequence. The invalid start byte is replaced with a single
    U+FFFD when errors='replace'.
    E.g. <80> is a continuation byte and can appear only after a start byte.
    """
    FFFD = '\ufffd'
    # Iterating over bytes yields ints, so each one is re-wrapped into a
    # one-byte bytes object before being decoded.
    for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
        self.assertCorrectUTF8Decoding(bytes([byte]), FFFD,
                                       'invalid start byte')
1883
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the string
    ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
    enough continuation bytes.  The incomplete sequence is replaced with a
    single U+FFFD when errors='replace'.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it's followed by only 2 valid continuation bytes and the
    last continuation byte is missing.
    Note: the continuation bytes must be all valid, if one of them is
    invalid another error will be raised.
    """
    sequences = [
        'C2', 'DF',
        'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
        'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
        'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
    ]
    FFFD = '\ufffd'
    for seq in sequences:
        # Each truncated sequence collapses into exactly one U+FFFD.
        self.assertCorrectUTF8Decoding(bytes.fromhex(seq), FFFD,
                                       'unexpected end of data')
1909
def test_invalid_cb_for_2bytes_seq(self):
    """
    Check that a 2-bytes sequence whose second byte is not a valid
    continuation byte raises an 'invalid continuation byte' error.
    With errors='replace' the start byte becomes a single U+FFFD and the
    second byte is decoded on its own.
    E.g. in <C2 41>, C2 starts a 2-bytes sequence but 41 is the ASCII
    letter 'A', not a continuation byte.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Second bytes outside 80..BF, each with the text expected from
    # errors='replace' for <start byte, second byte>.
    bad_second_bytes = (
        ('00', FFFD+'\x00'),
        ('7F', FFFD+'\x7f'),
        ('C0', FFFDx2),
        ('FF', FFFDx2),
    )
    sequences = [
        (start + ' ' + second, res)
        for start in ('C2', 'DF')
        for second, res in bad_second_bytes
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                       'invalid continuation byte')
1931
def test_invalid_cb_for_3bytes_seq(self):
    """
    Check that a 3-bytes sequence with an invalid continuation byte
    raises an 'invalid continuation byte' error.
    With errors='replace', a valid first continuation byte is replaced
    together with the start byte by a single U+FFFD and the third byte is
    decoded on its own; otherwise only the start byte becomes U+FFFD and
    the remaining bytes are decoded on their own.
    E.g. in <E1 80 41>, E1 starts a 3-bytes sequence and 80 is a valid
    continuation byte, but 41 is the ASCII letter 'A'.
    Note: after the start bytes E0 and ED the first continuation byte is
    limited to A0..BF and 80..9F respectively.  Python 2 used to accept
    the whole 80..BF range after ED; this is fixed in Python 3.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Bytes that are never continuation bytes, with the expected
    # replacement of <valid prefix, byte>.
    never_cb = [
        ('00', FFFD+'\x00'), ('7F', FFFD+'\x7f'),
        ('C0', FFFDx2), ('FF', FFFDx2),
    ]
    # (start byte, lowest and highest valid first cb,
    #  bytes in 80..BF that this start byte rejects as first cb)
    start_bytes = [
        ('E0', ('A0', 'BF'), ('80', '9F')),
        ('E1', ('80', 'BF'), ()),
        ('EC', ('80', 'BF'), ()),
        ('ED', ('80', '9F'), ('A0', 'BF')),  # see note ^
        ('EE', ('80', 'BF'), ()),
        ('EF', ('80', 'BF'), ()),
    ]
    sequences = []
    for start, valid_first_cbs, rejected_first_cbs in start_bytes:
        # Invalid second byte, in ascending byte order.
        second_bytes = (never_cb[:2]
                        + [(cb, FFFDx2) for cb in rejected_first_cbs]
                        + never_cb[2:])
        sequences += [(start + ' ' + second, res)
                      for second, res in second_bytes]
        # Valid second byte followed by an invalid third byte.
        for cb in valid_first_cbs:
            sequences += [(start + ' ' + cb + ' ' + third, res)
                          for third, res in never_cb]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                       'invalid continuation byte')
1989
def test_invalid_cb_for_4bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 4-bytes sequence are invalid.  When
    errors='replace', the start byte and all the following valid
    continuation bytes are replaced with a single U+FFFD, and all the bytes
    starting from the first invalid continuation byte (included) are
    handled separately.
    E.g. in the sequence <F1 80 80 41>, F1 is the start byte of a 4-bytes
    sequence, 80 80 are valid continuation bytes, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is F0 or F4, the valid ranges for the first
    continuation byte are limited to 90..BF and 80..8F respectively.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    sequences = [
        # start byte F0: the first cb must be in 90..BF
        ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
        ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
        ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
        ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
        ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
        ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
        ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
        ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
        ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
        ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
        ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
        ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
        ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
        ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
        # start byte F1
        ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'),
        ('F1 C0', FFFDx2), ('F1 FF', FFFDx2),
        ('F1 80 00', FFFD+'\x00'), ('F1 80 7F', FFFD+'\x7f'),
        ('F1 80 C0', FFFDx2), ('F1 80 FF', FFFDx2),
        ('F1 BF 00', FFFD+'\x00'), ('F1 BF 7F', FFFD+'\x7f'),
        ('F1 BF C0', FFFDx2), ('F1 BF FF', FFFDx2),
        ('F1 80 80 00', FFFD+'\x00'), ('F1 80 80 7F', FFFD+'\x7f'),
        ('F1 80 80 C0', FFFDx2), ('F1 80 80 FF', FFFDx2),
        ('F1 80 BF 00', FFFD+'\x00'), ('F1 80 BF 7F', FFFD+'\x7f'),
        ('F1 80 BF C0', FFFDx2), ('F1 80 BF FF', FFFDx2),
        ('F1 BF 80 00', FFFD+'\x00'), ('F1 BF 80 7F', FFFD+'\x7f'),
        ('F1 BF 80 C0', FFFDx2), ('F1 BF 80 FF', FFFDx2),
        ('F1 BF BF 00', FFFD+'\x00'), ('F1 BF BF 7F', FFFD+'\x7f'),
        ('F1 BF BF C0', FFFDx2), ('F1 BF BF FF', FFFDx2),
        # start byte F3
        ('F3 00', FFFD+'\x00'), ('F3 7F', FFFD+'\x7f'),
        ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
        ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
        ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
        ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
        ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
        ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
        ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
        ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
        ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
        ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
        ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
        ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
        ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
        # start byte F4: the first cb must be in 80..8F
        ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
        ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
        ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
        ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
        ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
        ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
        ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
        ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
        ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
        ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
        ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
        ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
        ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
        ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                       'invalid continuation byte')
2068
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002069 def test_codecs_idna(self):
2070 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00002071 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002072
def test_codecs_errors(self):
    """Check error handling when encoding and decoding strings.

    Covers the 'strict', 'ignore' and 'replace' handlers of the ascii
    codec, unknown character names and truncated escapes in
    unicode-escape, the "test.unicode1"/"test.unicode2" codecs, wrong
    argument types, and lone surrogates passed to int(), float() and
    complex().
    """
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding)
    with self.assertRaises(UnicodeError):
        text.encode('ascii')
    with self.assertRaises(UnicodeError):
        text.encode('ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Positional and keyword spellings must agree.
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    with self.assertRaises(UnicodeError):
        str(data, 'ascii')
    with self.assertRaises(UnicodeError):
        str(data, 'ascii', 'strict')
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')
    self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    with self.assertRaises(UnicodeError):
        b"\\".decode("unicode-escape")

    # The "test.unicode1" and "test.unicode2" codecs are provided by the
    # search function registered at the top of this module; their
    # encoders and decoders return values of the wrong shape.
    with self.assertRaises(TypeError):
        b"hello".decode("test.unicode1")
    with self.assertRaises(TypeError):
        str(b"hello", "test.unicode2")
    with self.assertRaises(TypeError):
        "hello".encode("test.unicode1")
    with self.assertRaises(TypeError):
        "hello".encode("test.unicode2")

    # Error handling (wrong arguments)
    with self.assertRaises(TypeError):
        "hello".encode(42, 42, 42)

    # Error handling (lone surrogate in
    # _PyUnicode_TransformDecimalAndSpaceToASCII())
    for convert in (int, float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            with self.assertRaises(ValueError):
                convert(lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00002113
Walter Dörwald28256f22003-01-19 16:59:20 +00002114 def test_codecs(self):
2115 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002116 self.assertEqual('hello'.encode('ascii'), b'hello')
2117 self.assertEqual('hello'.encode('utf-7'), b'hello')
2118 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002119 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002120 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2121 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2122 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002123
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002124 # Default encoding is utf-8
2125 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2126
Walter Dörwald28256f22003-01-19 16:59:20 +00002127 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002128 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002129 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002130 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2131 'utf-16-be', 'raw_unicode_escape',
Inada Naoki6a16b182019-03-18 15:44:11 +09002132 'unicode_escape'):
2133 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002134
Walter Dörwald28256f22003-01-19 16:59:20 +00002135 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002136 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002137 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002138 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002139 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002140
Walter Dörwald28256f22003-01-19 16:59:20 +00002141 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002142 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002143 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002144 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002145 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002146
Walter Dörwald28256f22003-01-19 16:59:20 +00002147 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002148 with warnings.catch_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01002149 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2150 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Inada Naoki6a16b182019-03-18 15:44:11 +09002151 'raw_unicode_escape', 'unicode_escape'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002152 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002153
Antoine Pitrou51f66482011-11-11 13:35:44 +01002154 # UTF-8 must be roundtrip safe for all code points
2155 # (except surrogates, which are forbidden).
2156 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002157 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002158 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002159 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002160
Walter Dörwald28256f22003-01-19 16:59:20 +00002161 def test_codecs_charmap(self):
2162 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002163 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002164 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002165 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002166 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2167 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002168 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002169 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2170 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002171 'iso8859_7', 'iso8859_9',
2172 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002173 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002174
Walter Dörwald28256f22003-01-19 16:59:20 +00002175 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2176 'cp1256', 'cp1257', 'cp1258',
2177 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002178
Walter Dörwald28256f22003-01-19 16:59:20 +00002179 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2180 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002181
Walter Dörwald28256f22003-01-19 16:59:20 +00002182 ### These have undefined mappings:
2183 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002184
Walter Dörwald28256f22003-01-19 16:59:20 +00002185 ### These fail the round-trip:
2186 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002187
Walter Dörwald28256f22003-01-19 16:59:20 +00002188 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002189 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002190
Walter Dörwald28256f22003-01-19 16:59:20 +00002191 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002192 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002193 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002194 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002195 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2196 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002197 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002198 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2199 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002200 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002201 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002202
Walter Dörwald28256f22003-01-19 16:59:20 +00002203 ### These have undefined mappings:
2204 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2205 #'cp1256', 'cp1257', 'cp1258',
2206 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002207 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002208 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002209
Walter Dörwald28256f22003-01-19 16:59:20 +00002210 ### These fail the round-trip:
2211 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002212
Walter Dörwald28256f22003-01-19 16:59:20 +00002213 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002214 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002215
Walter Dörwald28256f22003-01-19 16:59:20 +00002216 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002217 self.assertEqual(("abc" "def"), "abcdef")
2218 self.assertEqual(("abc" "def"), "abcdef")
2219 self.assertEqual(("abc" "def"), "abcdef")
2220 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2221 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002222
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002223 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002224 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2226 self.assertEqual(x, y)
2227
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002228 y = br'\U00100000'
2229 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2230 self.assertEqual(x, y)
2231 y = br'\U00010000'
2232 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2233 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002234
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002235 try:
2236 br'\U11111111'.decode("raw-unicode-escape")
2237 except UnicodeDecodeError as e:
2238 self.assertEqual(e.start, 0)
2239 self.assertEqual(e.end, 10)
2240 else:
2241 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002242
def test_conversion(self):
    """Check that str() and a str subclass honour __str__().

    Uses a plain object, a str subclass returning a plain str, and a str
    subclass whose __str__() returns the instance itself.
    """
    class ObjectToStr:
        def __str__(self):
            return "foo"

    class StrSubclassToStr(str):
        def __str__(self):
            return "foo"

    class StrSubclassToStrSubclass(str):
        # The constructor doubles its argument so the stored value can
        # be told apart from the text passed in.
        def __new__(cls, content=""):
            return str.__new__(cls, 2*content)

        def __str__(self):
            return self

    self.assertEqual(str(ObjectToStr()), "foo")
    self.assertEqual(str(StrSubclassToStr("bar")), "foo")

    # str() hands back what __str__() returned, subclass type included.
    via_str = str(StrSubclassToStrSubclass("foo"))
    self.assertEqual(via_str, "foofoo")
    self.assertIs(type(via_str), StrSubclassToStrSubclass)

    # Constructing another str subclass yields that subclass instead.
    via_subclass = StrSubclass(StrSubclassToStrSubclass("foo"))
    self.assertEqual(via_subclass, "foofoo")
    self.assertIs(type(via_subclass), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002267
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002268 def test_unicode_repr(self):
2269 class s1:
2270 def __repr__(self):
2271 return '\\n'
2272
2273 class s2:
2274 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002275 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002276
2277 self.assertEqual(repr(s1()), '\\n')
2278 self.assertEqual(repr(s2()), '\\n')
2279
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002280 def test_printable_repr(self):
2281 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002282 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002283
Zachary Ware9fe6d862013-12-08 00:20:35 -06002284 # This test only affects 32-bit platforms because expandtabs can only take
2285 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2286 # to take a 64-bit long, this test should apply to all platforms.
2287 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2288 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002289 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002290 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002291
@support.cpython_only
def test_expandtabs_optimization(self):
    """expandtabs() on a tab-free string returns the very same object."""
    tabless = 'abc'
    expanded = tabless.expandtabs()
    self.assertIs(expanded, tabless)
2296
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002297 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 if struct.calcsize('P') == 8:
2299 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002300 ascii_struct_size = 48
2301 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 else:
2303 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002304 ascii_struct_size = 24
2305 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306
2307 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2308 code = ord(char)
2309 if code < 0x100:
2310 char_size = 1 # sizeof(Py_UCS1)
2311 struct_size = ascii_struct_size
2312 elif code < 0x10000:
2313 char_size = 2 # sizeof(Py_UCS2)
2314 struct_size = compact_struct_size
2315 else:
2316 char_size = 4 # sizeof(Py_UCS4)
2317 struct_size = compact_struct_size
2318 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002319 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2320 # be allocatable, given enough memory.
2321 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 alloc = lambda: char * maxlen
2323 self.assertRaises(MemoryError, alloc)
2324 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002325
Victor Stinner808fc0a2010-03-22 12:50:40 +00002326 def test_format_subclass(self):
2327 class S(str):
2328 def __str__(self):
2329 return '__str__ overridden'
2330 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002331 self.assertEqual("%s" % s, '__str__ overridden')
2332 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002333
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002334 def test_subclass_add(self):
2335 class S(str):
2336 def __add__(self, o):
2337 return "3"
2338 self.assertEqual(S("4") + S("5"), "3")
2339 class S(str):
2340 def __iadd__(self, o):
2341 return "3"
2342 s = S("1")
2343 s += "4"
2344 self.assertEqual(s, "3")
2345
2346 def test_getnewargs(self):
2347 text = 'abc'
2348 args = text.__getnewargs__()
2349 self.assertIsNot(args[0], text)
2350 self.assertEqual(args[0], text)
2351 self.assertEqual(len(args), 1)
2352
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_resize(self):
    """getargs_u() must reflect a string's content after it is extended."""
    from _testcapi import getargs_u

    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        # fill wstr internal field
        before = getargs_u(text)
        self.assertEqual(before, text)

        # resize text: wstr field must be cleared and then recomputed
        text += 'c'
        after = getargs_u(text)
        self.assertNotEqual(before, after)
        self.assertEqual(after, text)
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002370
def test_compare(self):
    """Check ==, !=, <, <= and >= between ASCII, Latin-1, BMP and astral
    strings, including pairs made of equal but distinct objects."""
    # Issue #17615
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # Walk the full cartesian product rather than combinations(): only the
    # product contains the (s, s) pairs, without which the "equal" branch
    # below would never run.
    for text1, text2 in itertools.product(strings, repeat=2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # Compare the two distinct copies, not copy2 with itself.
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2444
def test_free_after_iterating(self):
    """Run support.check_free_after_iterating() for iter() and reversed()."""
    for make_iterator in (iter, reversed):
        support.check_free_after_iterating(self, make_iterator, str)
2448
def test_check_encoding_errors(self):
    """Unknown encoding / error handler names raise LookupError in dev mode.

    The checks run in a child interpreter started with ``-X dev``.  Each
    check exits with its own status code (21-26) when the expected
    LookupError is not raised; status 10 means every check passed.
    """
    # bpo-37388: str(bytes, ...) and str.encode() must check encoding and
    # errors arguments in dev mode
    encodings = ('ascii', 'utf8', 'latin1')
    invalid = 'Boom, Shaka Laka, Boom!'
    code = textwrap.dedent(f'''
        import sys
        encodings = {encodings!r}

        for data in (b'', b'short string'):
            try:
                str(data, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)

            try:
                str(data, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(22)

            for encoding in encodings:
                try:
                    str(data, encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

        for data in ('', 'short string'):
            try:
                data.encode(encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

            try:
                data.encode(errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(25)

            for encoding in encodings:
                try:
                    data.encode(encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(26)

        sys.exit(10)
    ''')
    proc = assert_python_failure('-X', 'dev', '-c', code)
    # Pass proc as the message so a failure shows the child's output.
    self.assertEqual(proc.rc, 10, proc)
2508
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002509
class CAPITest(unittest.TestCase):
    """Tests for str-related C API functions.

    The functions are reached either through ctypes (``pythonapi``) or
    through wrappers imported from the ``_testcapi`` extension module.
    """

    # Test PyUnicode_FromFormat()
    def test_from_format(self):
        """Check the output of PyUnicode_FromFormat() for many format specs."""
        import_helper.import_module('ctypes')
        from ctypes import (
            pythonapi, py_object, sizeof, c_char_p,
            c_int, c_long, c_longlong, c_ssize_t,
            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
        name = "PyUnicode_FromFormat"
        _PyUnicode_FromFormat = getattr(pythonapi, name)
        # PyUnicode_FromFormat() is variadic: declare its fixed argument so
        # that ctypes can use the variadic calling convention on platforms
        # where it differs from the regular one (e.g. ARM64 Apple platforms).
        _PyUnicode_FromFormat.argtypes = (c_char_p,)
        _PyUnicode_FromFormat.restype = py_object

        def PyUnicode_FromFormat(format, *args):
            # str arguments are wrapped in py_object; every other argument
            # is handed to ctypes unchanged.
            cargs = tuple(
                py_object(arg) if isinstance(arg, str) else arg
                for arg in args)
            return _PyUnicode_FromFormat(format, *cargs)

        def check_format(expected, format, *args):
            text = PyUnicode_FromFormat(format, *args)
            self.assertEqual(expected, text)

        # ascii format, non-ascii argument
        check_format('ascii\x7f=unicode\xe9',
                     b'ascii\x7f=%U', 'unicode\xe9')

        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
        self.assertRaisesRegex(ValueError,
            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
            'string, got a non-ASCII byte: 0xe9$',
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

        # test "%c"
        check_format('\uabcd',
                     b'%c', c_int(0xabcd))
        check_format('\U0010ffff',
                     b'%c', c_int(0x10ffff))
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x110000))
        # Issue #18183
        check_format('\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))

        # test "%"
        check_format('%',
                     b'%')
        check_format('%',
                     b'%%')
        check_format('%s',
                     b'%%s')
        check_format('[%]',
                     b'[%%]')
        check_format('%abc',
                     b'%%%s', b'abc')

        # truncated string
        check_format('abc',
                     b'%.3s', b'abcdef')
        check_format('abc[\ufffd',
                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
        check_format("'\\u20acABC'",
                     b'%A', '\u20acABC')
        check_format("'\\u20",
                     b'%.5A', '\u20acABCDEF')
        check_format("'\u20acABC'",
                     b'%R', '\u20acABC')
        check_format("'\u20acA",
                     b'%.3R', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3S', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3U', '\u20acABCDEF')
        check_format('\u20acAB',
                     b'%.3V', '\u20acABCDEF', None)
        check_format('abc[\ufffd',
                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

        # the following tests come from #7330
        # test width modifier and precision modifier with %S
        check_format("repr=  abc",
                     b'repr=%5S', 'abc')
        check_format("repr=ab",
                     b'repr=%.2S', 'abc')
        check_format("repr=   ab",
                     b'repr=%5.2S', 'abc')

        # test width modifier and precision modifier with %R
        check_format("repr=   'abc'",
                     b'repr=%8R', 'abc')
        check_format("repr='ab",
                     b'repr=%.3R', 'abc')
        check_format("repr=  'ab",
                     b'repr=%5.3R', 'abc')

        # test width modifier and precision modifier with %A
        check_format("repr=   'abc'",
                     b'repr=%8A', 'abc')
        check_format("repr='ab",
                     b'repr=%.3A', 'abc')
        check_format("repr=  'ab",
                     b'repr=%5.3A', 'abc')

        # test width modifier and precision modifier with %s
        check_format("repr=  abc",
                     b'repr=%5s', b'abc')
        check_format("repr=ab",
                     b'repr=%.2s', b'abc')
        check_format("repr=   ab",
                     b'repr=%5.2s', b'abc')

        # test width modifier and precision modifier with %U
        check_format("repr=  abc",
                     b'repr=%5U', 'abc')
        check_format("repr=ab",
                     b'repr=%.2U', 'abc')
        check_format("repr=   ab",
                     b'repr=%5.2U', 'abc')

        # test width modifier and precision modifier with %V
        check_format("repr=  abc",
                     b'repr=%5V', 'abc', b'123')
        check_format("repr=ab",
                     b'repr=%.2V', 'abc', b'123')
        check_format("repr=   ab",
                     b'repr=%5.2V', 'abc', b'123')
        check_format("repr=  123",
                     b'repr=%5V', None, b'123')
        check_format("repr=12",
                     b'repr=%.2V', None, b'123')
        check_format("repr=   12",
                     b'repr=%5.2V', None, b'123')

        # test integer formats (%i, %d, %u)
        check_format('010',
                     b'%03i', c_int(10))
        check_format('0010',
                     b'%0.4i', c_int(10))
        check_format('-123',
                     b'%i', c_int(-123))
        check_format('-123',
                     b'%li', c_long(-123))
        check_format('-123',
                     b'%lli', c_longlong(-123))
        check_format('-123',
                     b'%zi', c_ssize_t(-123))

        check_format('-123',
                     b'%d', c_int(-123))
        check_format('-123',
                     b'%ld', c_long(-123))
        check_format('-123',
                     b'%lld', c_longlong(-123))
        check_format('-123',
                     b'%zd', c_ssize_t(-123))

        check_format('123',
                     b'%u', c_uint(123))
        check_format('123',
                     b'%lu', c_ulong(123))
        check_format('123',
                     b'%llu', c_ulonglong(123))
        check_format('123',
                     b'%zu', c_size_t(123))

        # test long output
        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
        max_longlong = -min_longlong - 1
        check_format(str(min_longlong),
                     b'%lld', c_longlong(min_longlong))
        check_format(str(max_longlong),
                     b'%lld', c_longlong(max_longlong))
        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
        check_format(str(max_ulonglong),
                     b'%llu', c_ulonglong(max_ulonglong))
        # The result of %p is not checked: the call only has to succeed.
        PyUnicode_FromFormat(b'%p', c_void_p(-1))

        # test padding (width and/or precision)
        check_format('123'.rjust(10, '0'),
                     b'%010i', c_int(123))
        check_format('123'.rjust(100),
                     b'%100i', c_int(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100i', c_int(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80i', c_int(123))

        check_format('123'.rjust(10, '0'),
                     b'%010u', c_uint(123))
        check_format('123'.rjust(100),
                     b'%100u', c_uint(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100u', c_uint(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80u', c_uint(123))

        check_format('123'.rjust(10, '0'),
                     b'%010x', c_int(0x123))
        check_format('123'.rjust(100),
                     b'%100x', c_int(0x123))
        check_format('123'.rjust(100, '0'),
                     b'%.100x', c_int(0x123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80x', c_int(0x123))

        # test %A
        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

        # test %V
        check_format('repr=abc',
                     b'repr=%V', 'abc', b'xyz')

        # Test string decode from parameter of %s using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
        check_format('repr=\u4eba\u6c11',
                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

        # Test replace error handler.
        check_format('repr=abc\ufffd',
                     b'repr=%V', None, b'abc\xff')

        # not supported: copy the raw format string. these tests are just here
        # to check for crashes and should not be considered as specifications
        check_format('%s',
                     b'%1%s', b'abc')
        check_format('%1abc',
                     b'%1abc')
        check_format('%+i',
                     b'%+i', c_int(10))
        check_format('%.%s',
                     b'%.%s', b'abc')

        # Issue #33817: empty strings
        check_format('',
                     b'')
        check_format('',
                     b'%s', b'')

    # Test PyUnicode_AsWideChar()
    @support.cpython_only
    def test_aswidechar(self):
        """Check the (buffer, size) pairs returned for several buffer sizes."""
        from _testcapi import unicode_aswidechar
        import_helper.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        wchar, size = unicode_aswidechar('abcdef', 2)
        self.assertEqual(size, 2)
        self.assertEqual(wchar, 'ab')

        wchar, size = unicode_aswidechar('abc', 3)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc')

        wchar, size = unicode_aswidechar('abc', 4)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc', 10)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc\0def', 20)
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            buflen = 3
            nchar = 2
        else:  # sizeof(c_wchar) == 4
            buflen = 2
            nchar = 1
        wchar, size = unicode_aswidechar(nonbmp, buflen)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsWideCharString()
    @support.cpython_only
    def test_aswidecharstring(self):
        """Check the (string, size) pairs returned, embedded NUL included."""
        from _testcapi import unicode_aswidecharstring
        import_helper.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        wchar, size = unicode_aswidecharstring('abc')
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidecharstring('abc\0def')
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            nchar = 2
        else:  # sizeof(c_wchar) == 4
            nchar = 1
        wchar, size = unicode_aswidecharstring(nonbmp)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsUCS4()
    @support.cpython_only
    def test_asucs4(self):
        """Check results and SystemError cases for several buffer lengths."""
        from _testcapi import unicode_asucs4
        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
                  'a\ud800b\udfffc', '\ud834\udd1e']:
            length = len(s)
            self.assertEqual(unicode_asucs4(s, length, True), s+'\0')
            self.assertEqual(unicode_asucs4(s, length, False), s+'\uffff')
            self.assertEqual(unicode_asucs4(s, length+1, True), s+'\0\uffff')
            self.assertEqual(unicode_asucs4(s, length+1, False), s+'\0\uffff')
            self.assertRaises(SystemError, unicode_asucs4, s, length-1, True)
            self.assertRaises(SystemError, unicode_asucs4, s, length-2, False)
            # Same checks on a string with an embedded NUL character.
            s = '\0'.join([s, s])
            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')

    # Test PyUnicode_AsUTF8()
    @support.cpython_only
    def test_asutf8(self):
        """Check the UTF-8 bytes returned; lone surrogates must fail."""
        from _testcapi import unicode_asutf8

        bmp = '\u0100'
        bmp2 = '\uffff'
        nonbmp = chr(0x10ffff)

        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')

    # Test PyUnicode_AsUTF8AndSize()
    @support.cpython_only
    def test_asutf8andsize(self):
        """Check the (UTF-8 bytes, size) pairs; lone surrogates must fail."""
        from _testcapi import unicode_asutf8andsize

        bmp = '\u0100'
        bmp2 = '\uffff'
        nonbmp = chr(0x10ffff)

        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')

    # Test PyUnicode_FindChar()
    @support.cpython_only
    def test_findchar(self):
        """Check forward and backward searches, and the start/end handling."""
        from _testcapi import unicode_findchar

        for text in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
            for i, ch in enumerate(text):
                self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), 1), i)
                self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), -1), i)

        text = "!>_<!"
        self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), 1), -1)
        self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), -1), -1)
        # start < end
        self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, 1), 4)
        self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, -1), 4)
        # start >= end
        self.assertEqual(unicode_findchar(text, ord('!'), 0, 0, 1), -1)
        self.assertEqual(unicode_findchar(text, ord('!'), len(text), 0, 1), -1)
        # negative
        self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, 1), 0)
        self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, -1), 0)

    # Test PyUnicode_CopyCharacters()
    @support.cpython_only
    def test_copycharacters(self):
        """Check copies between strings of the same kind and the error cases."""
        from _testcapi import unicode_copycharacters

        strings = [
            'abcde', '\xa1\xa2\xa3\xa4\xa5',
            '\u4f60\u597d\u4e16\u754c\uff01',
            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
        ]

        for idx, from_ in enumerate(strings):
            # wide -> narrow: exceed maxchar limitation
            for to in strings[:idx]:
                self.assertRaises(
                    SystemError,
                    unicode_copycharacters, to, 0, from_, 0, 5
                )
            # same kind
            for from_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, 0, from_, from_start, 5),
                    (from_[from_start:from_start+5].ljust(5, '\0'),
                     5-from_start)
                )
            for to_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
                    (from_[to_start:to_start+5].rjust(5, '\0'),
                     5-to_start)
                )
            # narrow -> wide
            # Tests omitted since this creates invalid strings.

        s = strings[0]
        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)

    @support.cpython_only
    @support.requires_legacy_unicode_capi
    def test_encode_decimal(self):
        """Check unicode_encodedecimal() results and its error reporting."""
        from _testcapi import unicode_encodedecimal
        self.assertEqual(unicode_encodedecimal('123'),
                         b'123')
        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                         b'3.14')
        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         b' 3.14 ')
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
        self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character",
            unicode_encodedecimal, "123\u20ac", "replace")

    @support.cpython_only
    @support.requires_legacy_unicode_capi
    def test_transform_decimal(self):
        """Decimal digits become ASCII digits; other characters are kept."""
        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
        self.assertEqual(transform_decimal('123'),
                         '123')
        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                         '3.14')
        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         "\N{EM SPACE}3.14\N{EN SPACE}")
        self.assertEqual(transform_decimal('123\u20ac'),
                         '123\u20ac')

    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        """getargs_s_hash() must stay correct while a string grows."""
        # Issue #25709: Problem with string concatenation and utf-8 cache
        from _testcapi import getargs_s_hash
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2970
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions of the private ``_string`` module."""

    def test_formatter_parser(self):
        """formatter_parser() yields one 4-tuple per piece of the template."""
        def parsed(template):
            return [*_string.formatter_parser(template)]

        # (format string, expected list of parsed tuples)
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in cases:
            self.assertEqual(parsed(template), expected)

        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() gives the first name and the rest."""
        def split(name):
            first, rest = _string.formatter_field_name_split(name)
            # Materialize the second item so it can be compared to a list.
            return [first, list(rest)]

        # (field name, expected [first, [(flag, name), ...]])
        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
3020
3021
Walter Dörwald28256f22003-01-19 16:59:20 +00003022if __name__ == "__main__":
Ezio Melotti0dceb562013-01-10 07:43:26 +02003023 unittest.main()