blob: 177d80d27e1a41d1cae466b123d80f34fcf94d09 [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
Victor Stinner22eb6892019-06-26 00:51:05 +020014import textwrap
Guido van Rossum98297ee2007-11-06 21:34:58 +000015import unittest
16import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000017from test import support, string_tests
Victor Stinner22eb6892019-06-26 00:51:05 +020018from test.support.script_helper import assert_python_failure
Guido van Rossuma831cac2000-03-10 23:23:21 +000019
Neal Norwitz430f68b2005-11-24 22:00:56 +000020# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function serving two deliberately broken codecs.

    ``test.unicode1`` resolves to encode/decode callables that return a
    bare integer instead of a tuple; ``test.unicode2`` resolves to
    callables that return a ``(42, 42)`` tuple.  Every other encoding
    name yields ``None``.
    """
    if encoding == "test.unicode1":
        def encode1(input, errors="strict"):
            return 42  # not a tuple

        def decode1(input, errors="strict"):
            return 42  # not a tuple

        return (encode1, decode1, None, None)

    if encoding == "test.unicode2":
        def encode2(input, errors="strict"):
            return (42, 42)  # no unicode

        def decode2(input, errors="strict"):
            return (42, 42)  # no unicode

        return (encode2, decode2, None, None)

    return None


# Make both broken codecs reachable through the codec registry.
codecs.register(search_function)
37
def duplicate_string(text):
    """
    Return a copy of *text* built by an encode/decode round trip.

    The goal is a fresh clone of the text: a new object with a
    reference count of 1.

    This is only best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
47
class StrSubclass(str):
    """Plain ``str`` subclass that adds no behaviour of its own."""
50
Brett Cannon226b2302010-03-20 22:22:22 +000051class UnicodeTest(string_tests.CommonTest,
52 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020053 string_tests.MixinStrUnicodeTest,
54 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000055
Guido van Rossumef87d6e2007-05-02 19:09:54 +000056 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000057
58 def checkequalnofix(self, result, object, methodname, *args):
59 method = getattr(object, methodname)
60 realresult = method(*args)
61 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000062 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000063
64 # if the original is returned make sure that
65 # this doesn't happen with subclasses
66 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000067 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000068 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000069 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000070 object = usub(object)
71 method = getattr(object, methodname)
72 realresult = method(*args)
73 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000074 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000075
Jeremy Hylton504de6b2003-10-06 05:08:26 +000076 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000077 self.assertEqual('\xff', '\u00ff')
78 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000079 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
80 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
81 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000082 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000083 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000084
Georg Brandl559e5d72008-06-11 18:37:52 +000085 def test_ascii(self):
86 if not sys.platform.startswith('java'):
87 # Test basic sanity of repr()
88 self.assertEqual(ascii('abc'), "'abc'")
89 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
90 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
91 self.assertEqual(ascii('\\c'), "'\\\\c'")
92 self.assertEqual(ascii('\\'), "'\\\\'")
93 self.assertEqual(ascii('\n'), "'\\n'")
94 self.assertEqual(ascii('\r'), "'\\r'")
95 self.assertEqual(ascii('\t'), "'\\t'")
96 self.assertEqual(ascii('\b'), "'\\x08'")
97 self.assertEqual(ascii("'\""), """'\\'"'""")
98 self.assertEqual(ascii("'\""), """'\\'"'""")
99 self.assertEqual(ascii("'"), '''"'"''')
100 self.assertEqual(ascii('"'), """'"'""")
101 latin1repr = (
102 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
103 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
104 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
105 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
106 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
107 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
108 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
109 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
110 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
111 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
112 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
113 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
114 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
115 "\\xfe\\xff'")
116 testrepr = ascii(''.join(map(chr, range(256))))
117 self.assertEqual(testrepr, latin1repr)
118 # Test ascii works on wide unicode escapes without overflow.
119 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
120 ascii("\U00010000" * 39 + "\uffff" * 4096))
121
122 class WrongRepr:
123 def __repr__(self):
124 return b'byte-repr'
125 self.assertRaises(TypeError, ascii, WrongRepr())
126
Walter Dörwald28256f22003-01-19 16:59:20 +0000127 def test_repr(self):
128 if not sys.platform.startswith('java'):
129 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000130 self.assertEqual(repr('abc'), "'abc'")
131 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
132 self.assertEqual(repr('ab\\'), "'ab\\\\'")
133 self.assertEqual(repr('\\c'), "'\\\\c'")
134 self.assertEqual(repr('\\'), "'\\\\'")
135 self.assertEqual(repr('\n'), "'\\n'")
136 self.assertEqual(repr('\r'), "'\\r'")
137 self.assertEqual(repr('\t'), "'\\t'")
138 self.assertEqual(repr('\b'), "'\\x08'")
139 self.assertEqual(repr("'\""), """'\\'"'""")
140 self.assertEqual(repr("'\""), """'\\'"'""")
141 self.assertEqual(repr("'"), '''"'"''')
142 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000143 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000144 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000145 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
146 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
147 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
148 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
149 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000150 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
151 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
152 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
153 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
154 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
155 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
156 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
157 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000158 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000159 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000160 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000161 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
162 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000163
Georg Brandl559e5d72008-06-11 18:37:52 +0000164 class WrongRepr:
165 def __repr__(self):
166 return b'byte-repr'
167 self.assertRaises(TypeError, repr, WrongRepr())
168
Guido van Rossum49d6b072006-08-17 21:11:47 +0000169 def test_iterators(self):
170 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000171 it = "\u1111\u2222\u3333".__iter__()
172 self.assertEqual(next(it), "\u1111")
173 self.assertEqual(next(it), "\u2222")
174 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000175 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000176
Walter Dörwald28256f22003-01-19 16:59:20 +0000177 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000178 string_tests.CommonTest.test_count(self)
179 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000180 self.checkequalnofix(3, 'aaa', 'count', 'a')
181 self.checkequalnofix(0, 'aaa', 'count', 'b')
182 self.checkequalnofix(3, 'aaa', 'count', 'a')
183 self.checkequalnofix(0, 'aaa', 'count', 'b')
184 self.checkequalnofix(0, 'aaa', 'count', 'b')
185 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
186 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
187 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
188 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200189 # test mixed kinds
190 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
191 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
192 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
193 self.checkequal(0, 'a' * 10, 'count', '\u0102')
194 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
195 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
196 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
197 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
198 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
199 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
200 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
201 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000202
Walter Dörwald28256f22003-01-19 16:59:20 +0000203 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200204 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200205 # test implementation details of the memchr fast path
206 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
207 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
208 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
209 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
210 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
211 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
212 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
213 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000214 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
215 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
216 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000217
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000218 self.assertRaises(TypeError, 'hello'.find)
219 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200220 # test mixed kinds
221 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
222 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
223 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
224 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
225 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
226 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
227 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
228 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
229 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
230 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
231 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
232 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000233
Walter Dörwald28256f22003-01-19 16:59:20 +0000234 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000235 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200236 # test implementation details of the memrchr fast path
237 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
238 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
239 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
240 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
241 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
242 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
243 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000244 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000245 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
246 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
247 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200248 # test mixed kinds
249 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
250 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
251 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
252 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
253 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
254 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
255 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
256 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
257 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
258 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
259 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
260 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000261
Walter Dörwald28256f22003-01-19 16:59:20 +0000262 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000263 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000264 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
265 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
266 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
267 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
268 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
269 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
270 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
271 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200272 # test mixed kinds
273 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
274 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
275 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
276 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
277 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
278 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
279 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
280 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
281 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
282 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
283 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
284 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000285
Walter Dörwald28256f22003-01-19 16:59:20 +0000286 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000287 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000288 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
289 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
290 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
291 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000292
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000293 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
294 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
295 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
296 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
297 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200298 # test mixed kinds
299 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
300 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
301 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
302 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
303 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
304 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
305 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
306 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
307 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
308 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
309 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
310 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000311
Georg Brandlceee0772007-11-27 23:48:05 +0000312 def test_maketrans_translate(self):
313 # these work with plain translate()
314 self.checkequalnofix('bbbc', 'abababc', 'translate',
315 {ord('a'): None})
316 self.checkequalnofix('iiic', 'abababc', 'translate',
317 {ord('a'): None, ord('b'): ord('i')})
318 self.checkequalnofix('iiix', 'abababc', 'translate',
319 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
320 self.checkequalnofix('c', 'abababc', 'translate',
321 {ord('a'): None, ord('b'): ''})
322 self.checkequalnofix('xyyx', 'xzx', 'translate',
323 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200324
Georg Brandlceee0772007-11-27 23:48:05 +0000325 # this needs maketrans()
326 self.checkequalnofix('abababc', 'abababc', 'translate',
327 {'b': '<i>'})
328 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
329 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
330 # test alternative way of calling maketrans()
331 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
332 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
333
Victor Stinner5a29f252014-04-05 00:17:51 +0200334 # various tests switching from ASCII to latin1 or the opposite;
335 # same length, remove a letter, or replace with a longer string.
336 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
337 "[X]")
338 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
339 "[X]")
340 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
341 "[]")
342 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
343 "[XXX]")
344 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
345 "[\xe9]")
Victor Stinner33798672016-03-01 21:59:58 +0100346 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
347 "x123")
348 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
349 "x\xe9")
350
351 # test non-ASCII (don't take the fast-path)
Victor Stinner5a29f252014-04-05 00:17:51 +0200352 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
353 "[<\xe9>]")
354 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
355 "[a]")
356 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
357 "[]")
Victor Stinner33798672016-03-01 21:59:58 +0100358 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
359 "[123]")
360 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
361 "[<\u20ac>\xe9]")
Victor Stinner5a29f252014-04-05 00:17:51 +0200362
Victor Stinner4ff33af2014-04-05 11:56:37 +0200363 # invalid Unicode characters
364 invalid_char = 0x10ffff+1
365 for before in "a\xe9\u20ac\U0010ffff":
366 mapping = str.maketrans({before: invalid_char})
367 text = "[%s]" % before
368 self.assertRaises(ValueError, text.translate, mapping)
369
370 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000371 self.assertRaises(TypeError, self.type2test.maketrans)
372 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
373 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
374 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
375 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
376 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
377 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000378
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000379 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000380 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000381
Walter Dörwald28256f22003-01-19 16:59:20 +0000382 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000383 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000384
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200385 # test mixed kinds
386 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
387 left *= 9
388 right *= 9
389 for delim in ('c', '\u0102', '\U00010302'):
390 self.checkequal([left + right],
391 left + right, 'split', delim)
392 self.checkequal([left, right],
393 left + delim + right, 'split', delim)
394 self.checkequal([left + right],
395 left + right, 'split', delim * 2)
396 self.checkequal([left, right],
397 left + delim * 2 + right, 'split', delim *2)
398
399 def test_rsplit(self):
400 string_tests.CommonTest.test_rsplit(self)
401 # test mixed kinds
402 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
403 left *= 9
404 right *= 9
405 for delim in ('c', '\u0102', '\U00010302'):
406 self.checkequal([left + right],
407 left + right, 'rsplit', delim)
408 self.checkequal([left, right],
409 left + delim + right, 'rsplit', delim)
410 self.checkequal([left + right],
411 left + right, 'rsplit', delim * 2)
412 self.checkequal([left, right],
413 left + delim * 2 + right, 'rsplit', delim *2)
414
415 def test_partition(self):
416 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
417 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300418 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200419 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
420 left *= 9
421 right *= 9
422 for delim in ('c', '\u0102', '\U00010302'):
423 self.checkequal((left + right, '', ''),
424 left + right, 'partition', delim)
425 self.checkequal((left, delim, right),
426 left + delim + right, 'partition', delim)
427 self.checkequal((left + right, '', ''),
428 left + right, 'partition', delim * 2)
429 self.checkequal((left, delim * 2, right),
430 left + delim * 2 + right, 'partition', delim * 2)
431
432 def test_rpartition(self):
433 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
434 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300435 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200436 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
437 left *= 9
438 right *= 9
439 for delim in ('c', '\u0102', '\U00010302'):
440 self.checkequal(('', '', left + right),
441 left + right, 'rpartition', delim)
442 self.checkequal((left, delim, right),
443 left + delim + right, 'rpartition', delim)
444 self.checkequal(('', '', left + right),
445 left + right, 'rpartition', delim * 2)
446 self.checkequal((left, delim * 2, right),
447 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000448
Walter Dörwald28256f22003-01-19 16:59:20 +0000449 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000450 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000451
Guido van Rossumf1044292007-09-27 18:01:22 +0000452 class MyWrapper:
453 def __init__(self, sval): self.sval = sval
454 def __str__(self): return self.sval
455
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000456 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000457 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
458 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
459 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
460 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
461 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
462 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
463 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000464 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
465 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
466 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
467 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000468
Martin Panterb71c0952017-01-12 11:54:59 +0000469 @unittest.skipIf(sys.maxsize > 2**32,
470 'needs too much memory on a 64-bit platform')
471 def test_join_overflow(self):
472 size = int(sys.maxsize**0.5) + 1
473 seq = ('A' * size,) * size
474 self.assertRaises(OverflowError, ''.join, seq)
475
Walter Dörwald28256f22003-01-19 16:59:20 +0000476 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000477 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000478
Walter Dörwald28256f22003-01-19 16:59:20 +0000479 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000480 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
481 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200482 # test mixed kinds
483 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
484 left *= 9
485 right *= 9
486 for delim in ('c', '\u0102', '\U00010302'):
487 for repl in ('d', '\u0103', '\U00010303'):
488 self.checkequal(left + right,
489 left + right, 'replace', delim, repl)
490 self.checkequal(left + repl + right,
491 left + delim + right,
492 'replace', delim, repl)
493 self.checkequal(left + right,
494 left + right, 'replace', delim * 2, repl)
495 self.checkequal(left + repl + right,
496 left + delim * 2 + right,
497 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000498
Victor Stinner59de0ee2011-10-07 10:01:28 +0200499 @support.cpython_only
500 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200501 pattern = 'abc'
502 text = 'abc def'
503 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200504
Guido van Rossum98297ee2007-11-06 21:34:58 +0000505 def test_bytes_comparison(self):
Brett Cannon226b2302010-03-20 22:22:22 +0000506 with support.check_warnings():
507 warnings.simplefilter('ignore', BytesWarning)
508 self.assertEqual('abc' == b'abc', False)
509 self.assertEqual('abc' != b'abc', True)
510 self.assertEqual('abc' == bytearray(b'abc'), False)
511 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000512
Walter Dörwald28256f22003-01-19 16:59:20 +0000513 def test_comparison(self):
514 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000515 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000516 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000517 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000518
519 if 0:
520 # Move these tests to a Unicode collation module test...
521 # Testing UTF-16 code point order comparisons...
522
523 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000524 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000525 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000526 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000527
528 # Non surrogate above surrogate value, fixup required
529 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000530 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000531
532 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000533 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000534 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000535 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000536 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000537 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000538 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000539 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000540 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000541 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000542 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000544 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000545 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000546 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000547 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000548 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000549 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000550 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000551 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000552 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000553 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000554 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000555 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000556 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000557 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000558 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000559 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000560 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000561 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000562 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000563 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000564 test_lecmp(s, s2)
565
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000566 test_fixup('\ue000')
567 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000568
569 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000570 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000571
def test_islower(self):
    # Inherited checks first, then the str-only code points.
    super().test_islower()
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # (character, expected result of islower()), in assertion order.
    expectations = (
        ('\u2167', False),
        ('\u2177', True),
        # non-BMP, uppercase
        ('\U00010401', False),
        ('\U00010427', False),
        # non-BMP, lowercase
        ('\U00010429', True),
        ('\U0001044E', True),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, expected in expectations:
        verify = self.assertTrue if expected else self.assertFalse
        verify(char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000586
def test_isupper(self):
    # Inherited checks first, then the str-only code points.
    super().test_isupper()
    # The U+1FFC check is skipped when sys.platform starts with 'java'.
    is_java_platform = sys.platform.startswith('java')
    if not is_java_platform:
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # (character, expected result of isupper()), in assertion order.
    expectations = (
        ('\u2167', True),
        ('\u2177', False),
        # non-BMP, uppercase
        ('\U00010401', True),
        ('\U00010427', True),
        # non-BMP, lowercase
        ('\U00010429', False),
        ('\U0001044E', False),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, expected in expectations:
        verify = self.assertTrue if expected else self.assertFalse
        verify(char.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000602
def test_istitle(self):
    super().test_istitle()
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for char in not_title:
        message = '{!a} is not title'.format(char)
        self.assertFalse(char.istitle(), message)
614
def test_isspace(self):
    super().test_isspace()
    bmp_cases = ((True, '\u2000'), (True, '\u200a'), (False, '\u2014'))
    for expected, char in bmp_cases:
        self.checkequalnofix(expected, char, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for char in non_bmp:
        message = '{!a} is not space.'.format(char)
        self.assertFalse(char.isspace(), message)
624
def test_isalnum(self):
    super().test_isalnum()
    # Non-BMP characters that isalnum() must accept.
    alnum_chars = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107',
    )
    for char in alnum_chars:
        message = '{!a} is alnum.'.format(char)
        self.assertTrue(char.isalnum(), message)
Walter Dörwald28256f22003-01-19 16:59:20 +0000630
def test_isalpha(self):
    super().test_isalpha()
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    cased = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E')
    for char in cased:
        self.assertTrue(char.isalpha())
    # non-BMP, non-cased
    non_cased = ('\U0001F40D', '\U0001F46F')
    for char in non_cased:
        self.assertFalse(char.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000642
def test_isascii(self):
    super().test_isascii()
    # One BMP and one non-BMP character, neither of them ASCII.
    for non_ascii in ("\u20ac", "\U0010ffff"):
        self.assertFalse(non_ascii.isascii())
647
def test_isdecimal(self):
    # (expected, text) pairs, in the order they are checked.
    table = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in table:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    non_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    for char in non_decimal:
        message = '{!a} is not decimal.'.format(char)
        self.assertFalse(char.isdecimal(), message)
    decimal = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for char in decimal:
        message = '{!a} is decimal.'.format(char)
        self.assertTrue(char.isdecimal(), message)
665
def test_isdigit(self):
    super().test_isdigit()
    bmp_cases = ((True, '\u2460'), (False, '\xbc'), (True, '\u0660'))
    for expected, char in bmp_cases:
        self.checkequalnofix(expected, char, 'isdigit')

    non_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    for char in non_digits:
        message = '{!a} is not a digit.'.format(char)
        self.assertFalse(char.isdigit(), message)
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for char in digits:
        message = '{!a} is a digit.'.format(char)
        self.assertTrue(char.isdigit(), message)
677
def test_isnumeric(self):
    # (expected, text) pairs, in the order they are checked.
    table = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in table:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    with self.assertRaises(TypeError):
        "abc".isnumeric(42)

    non_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    for char in non_numeric:
        message = '{!a} is not numeric.'.format(char)
        self.assertFalse(char.isnumeric(), message)
    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for char in numeric:
        message = '{!a} is numeric.'.format(char)
        self.assertTrue(char.isnumeric(), message)
696
Martin v. Löwis47383402007-08-15 07:32:56 +0000697 def test_isidentifier(self):
698 self.assertTrue("a".isidentifier())
699 self.assertTrue("Z".isidentifier())
700 self.assertTrue("_".isidentifier())
701 self.assertTrue("b0".isidentifier())
702 self.assertTrue("bc".isidentifier())
703 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000704 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500705 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000706
707 self.assertFalse(" ".isidentifier())
708 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000709 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000710 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000711
Georg Brandl559e5d72008-06-11 18:37:52 +0000712 def test_isprintable(self):
713 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000714 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000715 self.assertTrue("abcdefg".isprintable())
716 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000717 # some defined Unicode character
718 self.assertTrue("\u0374".isprintable())
719 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000720 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000721 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000722 self.assertFalse("\ud800".isprintable())
723
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300724 self.assertTrue('\U0001F46F'.isprintable())
725 self.assertFalse('\U000E0020'.isprintable())
726
727 def test_surrogates(self):
728 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
729 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
730 self.assertTrue(s.islower())
731 self.assertFalse(s.isupper())
732 self.assertFalse(s.istitle())
733 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
734 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
735 self.assertFalse(s.islower())
736 self.assertTrue(s.isupper())
737 self.assertTrue(s.istitle())
738
739 for meth_name in ('islower', 'isupper', 'istitle'):
740 meth = getattr(str, meth_name)
741 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
742 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
743
744 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
745 'isdecimal', 'isnumeric',
746 'isidentifier', 'isprintable'):
747 meth = getattr(str, meth_name)
748 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
749 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
750 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
751 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
752
753
def test_lower(self):
    # Shared lower() checks first, then str-specific cases: non-BMP
    # letters, one-to-many mappings and the context-dependent sigma.
    string_tests.CommonTest.test_lower(self)
    self.assertEqual('\U00010427'.lower(), '\U0001044F')
    self.assertEqual('\U00010427\U00010427'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.lower(),
                     'x\U0001044Fx\U0001044F')
    # U+FB01 LATIN SMALL LIGATURE FI is already lowercase.  It is written
    # as an escape because a literal ligature is easily normalized to the
    # ASCII letters "fi", which would turn this into a trivial check.
    self.assertEqual('\ufb01'.lower(), '\ufb01')
    self.assertEqual('\u0130'.lower(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.lower(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
    self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
    self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
    self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
    self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
    self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
    self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300775
Benjamin Petersond5890c82012-01-14 13:23:30 -0500776 def test_casefold(self):
777 self.assertEqual('hello'.casefold(), 'hello')
778 self.assertEqual('hELlo'.casefold(), 'hello')
779 self.assertEqual('ß'.casefold(), 'ss')
780 self.assertEqual('fi'.casefold(), 'fi')
781 self.assertEqual('\u03a3'.casefold(), '\u03c3')
782 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700783 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500784
def test_upper(self):
    # Shared upper() checks first, then str-specific cases: non-BMP
    # letters and one-to-many uppercase mappings.
    string_tests.CommonTest.test_upper(self)
    self.assertEqual('\U0001044F'.upper(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('X\U00010427x\U0001044F'.upper(),
                     'X\U00010427X\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI uppercases to the two letters "FI".
    # It is written as an escape because a literal ligature is easily
    # normalized to ASCII "fi", which would turn this into a trivial check.
    self.assertEqual('\ufb01'.upper(), 'FI')
    self.assertEqual('\u0130'.upper(), '\u0130')
    self.assertEqual('\u03a3'.upper(), '\u03a3')
    self.assertEqual('ß'.upper(), 'SS')
    self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
    self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
    self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300801
def test_capitalize(self):
    # Shared capitalize() checks first, then str-specific cases: non-BMP
    # letters and characters whose case mapping is more than one character.
    string_tests.CommonTest.test_capitalize(self)
    self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                     'X\U0001044Fx\U0001044F')
    self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
    exp = '\u0399\u0308\u0300\u0069\u0307'
    self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
    # The word starts with U+FB01 LATIN SMALL LIGATURE FI, which must
    # capitalize to "Fi".  It is written as an escape because a literal
    # ligature is easily normalized to ASCII "fi", which would turn this
    # into a trivial check.
    self.assertEqual('\ufb01nnish'.capitalize(), 'Finnish')
    self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300818
def test_title(self):
    # Inherited title() checks first, then str-specific cases: non-BMP
    # letters and characters whose case mapping is more than one character.
    super().test_title()
    self.assertEqual('\U0001044F'.title(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
    # The word starts with U+FB01 LATIN SMALL LIGATURE FI, which must
    # titlecase to "Fi".  It is written as an escape because a literal
    # ligature is easily normalized to ASCII "fi", which would turn this
    # into a trivial check.
    self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
    self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
    self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300835
def test_swapcase(self):
    # Shared swapcase() checks first, then str-specific cases: non-BMP
    # letters, one-to-many mappings and the context-dependent sigma.
    string_tests.CommonTest.test_swapcase(self)
    self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
    self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
    self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.swapcase(),
                     '\U0001044F\U00010427')
    self.assertEqual('\U0001044F\U00010427'.swapcase(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                     'x\U0001044FX\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI is lowercase, so swapcase() must give
    # its uppercase expansion "FI".  It is written as an escape because a
    # literal ligature is easily normalized to ASCII "fi", which would turn
    # this into a trivial check.
    self.assertEqual('\ufb01'.swapcase(), 'FI')
    self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.swapcase(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
    self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
    self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
    self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
    self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
    self.assertEqual('ß'.swapcase(), 'SS')
    self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300861
def test_center(self):
    string_tests.CommonTest.test_center(self)
    # Centre 'x' with a non-BMP fill character at increasing widths.
    fill = '\U0010FFFF'
    expected_by_width = (
        (2, 'x' + fill),
        (3, fill + 'x' + fill),
        (4, fill + 'x' + fill + fill),
    )
    for width, expected in expected_by_width:
        self.assertEqual('x'.center(width, fill), expected)
870
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    # Issue #22643
    # upper() on a sufficiently long string must raise OverflowError.
    # NOTE(review): the length is presumably chosen so that 12 bytes per
    # character no longer fit in 32 bits -- confirm against the issue.
    size = 2**32//12 + 1
    try:
        s = "ü" * size
    except MemoryError:
        # The input alone may not fit in memory; that is not a failure.
        self.skipTest('not enough memory (%.0f MiB required)' % (size / 2**20))
    try:
        self.assertRaises(OverflowError, s.upper)
    finally:
        # Drop the reference to the huge string whatever the outcome.
        del s
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400884
Walter Dörwald28256f22003-01-19 16:59:20 +0000885 def test_contains(self):
886 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000887 self.assertIn('a', 'abdb')
888 self.assertIn('a', 'bdab')
889 self.assertIn('a', 'bdaba')
890 self.assertIn('a', 'bdba')
891 self.assertNotIn('a', 'bdb')
892 self.assertIn('a', 'bdba')
893 self.assertIn('a', ('a',1,None))
894 self.assertIn('a', (1,None,'a'))
895 self.assertIn('a', ('a',1,None))
896 self.assertIn('a', (1,None,'a'))
897 self.assertNotIn('a', ('x',1,'y'))
898 self.assertNotIn('a', ('x',1,None))
899 self.assertNotIn('abcd', 'abcxxxx')
900 self.assertIn('ab', 'abcd')
901 self.assertIn('ab', 'abc')
902 self.assertIn('ab', (1,None,'ab'))
903 self.assertIn('', 'abc')
904 self.assertIn('', '')
905 self.assertIn('', 'abc')
906 self.assertNotIn('\0', 'abc')
907 self.assertIn('\0', '\0abc')
908 self.assertIn('\0', 'abc\0')
909 self.assertIn('a', '\0abc')
910 self.assertIn('asdf', 'asdf')
911 self.assertNotIn('asdf', 'asd')
912 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000913
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000914 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200915 # test mixed kinds
916 for fill in ('a', '\u0100', '\U00010300'):
917 fill *= 9
918 for delim in ('c', '\u0102', '\U00010302'):
919 self.assertNotIn(delim, fill)
920 self.assertIn(delim, fill + delim)
921 self.assertNotIn(delim * 2, fill)
922 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000923
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300924 def test_issue18183(self):
925 '\U00010000\U00100000'.lower()
926 '\U00010000\U00100000'.casefold()
927 '\U00010000\U00100000'.upper()
928 '\U00010000\U00100000'.capitalize()
929 '\U00010000\U00100000'.title()
930 '\U00010000\U00100000'.swapcase()
931 '\U00100000'.center(3, '\U00010000')
932 '\U00100000'.ljust(3, '\U00010000')
933 '\U00100000'.rjust(3, '\U00010000')
934
Eric Smith8c663262007-08-25 02:26:07 +0000935 def test_format(self):
936 self.assertEqual(''.format(), '')
937 self.assertEqual('a'.format(), 'a')
938 self.assertEqual('ab'.format(), 'ab')
939 self.assertEqual('a{{'.format(), 'a{')
940 self.assertEqual('a}}'.format(), 'a}')
941 self.assertEqual('{{b'.format(), '{b')
942 self.assertEqual('}}b'.format(), '}b')
943 self.assertEqual('a{{b'.format(), 'a{b')
944
945 # examples from the PEP:
946 import datetime
947 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
948 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
949 "My name is Fred")
950 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
951 "My name is Fred :-{}")
952
953 d = datetime.date(2007, 8, 18)
954 self.assertEqual("The year is {0.year}".format(d),
955 "The year is 2007")
956
Eric Smith8c663262007-08-25 02:26:07 +0000957 # classes we'll use for testing
958 class C:
959 def __init__(self, x=100):
960 self._x = x
961 def __format__(self, spec):
962 return spec
963
964 class D:
965 def __init__(self, x):
966 self.x = x
967 def __format__(self, spec):
968 return str(self.x)
969
970 # class with __str__, but no __format__
971 class E:
972 def __init__(self, x):
973 self.x = x
974 def __str__(self):
975 return 'E(' + self.x + ')'
976
977 # class with __repr__, but no __format__ or __str__
978 class F:
979 def __init__(self, x):
980 self.x = x
981 def __repr__(self):
982 return 'F(' + self.x + ')'
983
984 # class with __format__ that forwards to string, for some format_spec's
985 class G:
986 def __init__(self, x):
987 self.x = x
988 def __str__(self):
989 return "string is " + self.x
990 def __format__(self, format_spec):
991 if format_spec == 'd':
992 return 'G(' + self.x + ')'
993 return object.__format__(self, format_spec)
994
Eric Smith739e2ad2007-08-27 19:07:22 +0000995 class I(datetime.date):
996 def __format__(self, format_spec):
997 return self.strftime(format_spec)
998
Eric Smith185e30c2007-08-30 22:23:08 +0000999 class J(int):
1000 def __format__(self, format_spec):
1001 return int.__format__(self * 2, format_spec)
1002
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001003 class M:
1004 def __init__(self, x):
1005 self.x = x
1006 def __repr__(self):
1007 return 'M(' + self.x + ')'
1008 __str__ = None
1009
1010 class N:
1011 def __init__(self, x):
1012 self.x = x
1013 def __repr__(self):
1014 return 'N(' + self.x + ')'
1015 __format__ = None
Eric Smith8c663262007-08-25 02:26:07 +00001016
1017 self.assertEqual(''.format(), '')
1018 self.assertEqual('abc'.format(), 'abc')
1019 self.assertEqual('{0}'.format('abc'), 'abc')
1020 self.assertEqual('{0:}'.format('abc'), 'abc')
1021# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1022 self.assertEqual('X{0}'.format('abc'), 'Xabc')
1023 self.assertEqual('{0}X'.format('abc'), 'abcX')
1024 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1025 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1026 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1027 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1028 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1029 self.assertEqual('{0}'.format(-15), '-15')
1030 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1031 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1032 self.assertEqual('{{'.format(), '{')
1033 self.assertEqual('}}'.format(), '}')
1034 self.assertEqual('{{}}'.format(), '{}')
1035 self.assertEqual('{{x}}'.format(), '{x}')
1036 self.assertEqual('{{{0}}}'.format(123), '{123}')
1037 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1038 self.assertEqual('}}{{'.format(), '}{')
1039 self.assertEqual('}}x{{'.format(), '}x{')
1040
Eric Smith7ade6482007-08-26 22:27:13 +00001041 # weird field names
1042 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1043 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001044 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001045
Eric Smith8c663262007-08-25 02:26:07 +00001046 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1047 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1048 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1049 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1050 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1051 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1052 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1053
Eric Smith8c663262007-08-25 02:26:07 +00001054 # strings
1055 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1056 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1057 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1058 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1059 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1060 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1061 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1062 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1063 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1064 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1065 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1066 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1067 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1068 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1069 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1070 self.assertEqual('{0:>7s}'.format('result'), ' result')
1071 self.assertEqual('{0:>8s}'.format('result'), ' result')
1072 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1073 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1074 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1075 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1076 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1077 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1078
Eric V. Smith2ea97122014-04-14 11:55:10 -04001079 # issue 12546: use \x00 as a fill character
1080 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1081 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1082 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1083 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1084
1085 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1086 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1087 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1088 self.assertEqual('{0:<6}'.format(3), '3 ')
1089
1090 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1091 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1092 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1093 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1094
1095 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1096 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1097 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1098 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1099
Eric Smith8c663262007-08-25 02:26:07 +00001100 # format specifiers for user defined type
1101 self.assertEqual('{0:abc}'.format(C()), 'abc')
1102
Georg Brandld52429f2008-07-04 15:55:02 +00001103 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001104 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1105 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1106 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1107 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1108 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1109 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1110 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001111 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001112 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1113 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001114 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001115 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001116 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001117 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1118 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001119 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001120
Eric Smith8c663262007-08-25 02:26:07 +00001121 # test fallback to object.__format__
1122 self.assertEqual('{0}'.format({}), '{}')
1123 self.assertEqual('{0}'.format([]), '[]')
1124 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001125
Eric Smith8c663262007-08-25 02:26:07 +00001126 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001127 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1128
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001129 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1130 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1131 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001132
Eric Smith739e2ad2007-08-27 19:07:22 +00001133 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1134 month=8,
1135 day=27)),
1136 "date: 2007-08-27")
1137
Eric Smith185e30c2007-08-30 22:23:08 +00001138 # test deriving from a builtin type and overriding __format__
1139 self.assertEqual("{0}".format(J(10)), "20")
1140
1141
Eric Smith8c663262007-08-25 02:26:07 +00001142 # string format specifiers
1143 self.assertEqual('{0:}'.format('a'), 'a')
1144
1145 # computed format specifiers
1146 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1147 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1148 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1149 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1150 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1151
1152 # test various errors
1153 self.assertRaises(ValueError, '{'.format)
1154 self.assertRaises(ValueError, '}'.format)
1155 self.assertRaises(ValueError, 'a{'.format)
1156 self.assertRaises(ValueError, 'a}'.format)
1157 self.assertRaises(ValueError, '{a'.format)
1158 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001159 self.assertRaises(IndexError, '{0}'.format)
1160 self.assertRaises(IndexError, '{1}'.format, 'abc')
1161 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001162 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001163 self.assertRaises(ValueError, "abc{0:{}".format)
1164 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001165 self.assertRaises(IndexError, "{0.}".format)
1166 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001167 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001168 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001169 self.assertRaises(KeyError, "{0]}".format)
1170 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001171 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001172 self.assertRaises(ValueError, "{0[0}".format, 0)
1173 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1174 self.assertRaises(KeyError, "{c]}".format)
1175 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1176 self.assertRaises(ValueError, "{0}}".format, 0)
1177 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001178 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001179 self.assertRaises(ValueError, "{0!}".format, 0)
1180 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001181 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001182 self.assertRaises(IndexError, "{:}".format)
1183 self.assertRaises(IndexError, "{:s}".format)
1184 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001185 big = "23098475029384702983476098230754973209482573"
1186 self.assertRaises(ValueError, ("{" + big + "}").format)
1187 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001188
Eric Smith41669ca2009-05-23 14:23:22 +00001189 # issue 6089
1190 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1191 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1192
Eric Smith8c663262007-08-25 02:26:07 +00001193 # can't have a replacement on the field name portion
1194 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1195
1196 # exceed maximum recursion depth
1197 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1198 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1199 0, 1, 2, 3, 4, 5, 6, 7)
1200
1201 # string format spec errors
1202 self.assertRaises(ValueError, "{0:-s}".format, '')
1203 self.assertRaises(ValueError, format, "", "-")
1204 self.assertRaises(ValueError, "{0:=s}".format, '')
1205
Eric Smithb1ebcc62008-07-15 13:02:41 +00001206 # Alternate formatting is not supported
1207 self.assertRaises(ValueError, format, '', '#')
1208 self.assertRaises(ValueError, format, '', '#20')
1209
Victor Stinnerece58de2012-04-23 23:36:38 +02001210 # Non-ASCII
1211 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1212 'ABC\u0410\u0411\u0412')
1213 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1214 'ABC')
1215 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1216 '')
1217
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001218 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001219 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1220 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1221 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1222 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1223 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1224 self.assertRaises(ValueError, "{a{}b}".format, 42)
1225 self.assertRaises(ValueError, "{a{b}".format, 42)
1226 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001227
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001228 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001229
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001230 # Blocking fallback
1231 m = M('data')
1232 self.assertEqual("{!r}".format(m), 'M(data)')
1233 self.assertRaises(TypeError, "{!s}".format, m)
1234 self.assertRaises(TypeError, "{}".format, m)
1235 n = N('data')
1236 self.assertEqual("{!r}".format(n), 'N(data)')
1237 self.assertEqual("{!s}".format(n), 'N(data)')
1238 self.assertRaises(TypeError, "{}".format, n)
1239
Eric Smith27bbca62010-11-04 17:06:58 +00001240 def test_format_map(self):
1241 self.assertEqual(''.format_map({}), '')
1242 self.assertEqual('a'.format_map({}), 'a')
1243 self.assertEqual('ab'.format_map({}), 'ab')
1244 self.assertEqual('a{{'.format_map({}), 'a{')
1245 self.assertEqual('a}}'.format_map({}), 'a}')
1246 self.assertEqual('{{b'.format_map({}), '{b')
1247 self.assertEqual('}}b'.format_map({}), '}b')
1248 self.assertEqual('a{{b'.format_map({}), 'a{b')
1249
1250 # using mappings
1251 class Mapping(dict):
1252 def __missing__(self, key):
1253 return key
1254 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1255 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1256
1257 class InternalMapping:
1258 def __init__(self):
1259 self.mapping = {'a': 'hello'}
1260 def __getitem__(self, key):
1261 return self.mapping[key]
1262 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1263
1264
Eric Smith27bbca62010-11-04 17:06:58 +00001265 class C:
1266 def __init__(self, x=100):
1267 self._x = x
1268 def __format__(self, spec):
1269 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001270 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1271
1272 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001273 self.assertRaises(TypeError, ''.format_map)
1274 self.assertRaises(TypeError, 'a'.format_map)
1275
1276 self.assertRaises(ValueError, '{'.format_map, {})
1277 self.assertRaises(ValueError, '}'.format_map, {})
1278 self.assertRaises(ValueError, 'a{'.format_map, {})
1279 self.assertRaises(ValueError, 'a}'.format_map, {})
1280 self.assertRaises(ValueError, '{a'.format_map, {})
1281 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001282
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001283 # issue #12579: can't supply positional params to format_map
1284 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1285 self.assertRaises(ValueError, '{}'.format_map, 'a')
1286 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1287
Serhiy Storchaka50754162017-08-03 11:45:23 +03001288 class BadMapping:
1289 def __getitem__(self, key):
1290 return 1/0
1291 self.assertRaises(KeyError, '{a}'.format_map, {})
1292 self.assertRaises(TypeError, '{a}'.format_map, [])
1293 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1294
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001295 def test_format_huge_precision(self):
1296 format_string = ".{}f".format(sys.maxsize + 1)
1297 with self.assertRaises(ValueError):
1298 result = format(2.34, format_string)
1299
1300 def test_format_huge_width(self):
1301 format_string = "{}f".format(sys.maxsize + 1)
1302 with self.assertRaises(ValueError):
1303 result = format(2.34, format_string)
1304
1305 def test_format_huge_item_number(self):
1306 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1307 with self.assertRaises(ValueError):
1308 result = format_string.format(2.34)
1309
Eric Smith8ec90442009-03-14 12:29:34 +00001310 def test_format_auto_numbering(self):
1311 class C:
1312 def __init__(self, x=100):
1313 self._x = x
1314 def __format__(self, spec):
1315 return spec
1316
1317 self.assertEqual('{}'.format(10), '10')
1318 self.assertEqual('{:5}'.format('s'), 's ')
1319 self.assertEqual('{!r}'.format('s'), "'s'")
1320 self.assertEqual('{._x}'.format(C(10)), '10')
1321 self.assertEqual('{[1]}'.format([1, 2]), '2')
1322 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1323 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1324
1325 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1326 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1327
1328 # can't mix and match numbering and auto-numbering
1329 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1330 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1331 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1332 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1333
1334 # can mix and match auto-numbering and named
1335 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1336 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1337 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1338 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1339
def test_formatting(self):
    # printf-style (%) formatting of str: runs the shared mixin checks
    # first, then the str-specific cases below.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads to a minimum of five columns, hence the doubled space
    # in front of the four-character values.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3),
                     'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3),
                     'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5),
                     'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57),
                     'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57),
                     'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x': "abc", 'y': "def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x': "abc", '\xfc': "def"},
                     'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation
    # (assertions that were verbatim repeats have been collapsed):
    self.assertEqual('...%(foo)s...' % {'foo': "abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo': "abc", 'def': 123},
                     '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1, 2, 3, "abc"),
                     '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1, 2, 3, "abc"),
                     '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; a negative
    # width left-justifies.
    self.assertEqual('%*s' % (5, 'abc',), '  abc')
    self.assertEqual('%*s' % (-5, 'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5, 2, 'abc',), '   ab')
    self.assertEqual('%*.*s' % (5, 3, 'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5, 3, 'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    # issue 19995
    class PseudoInt:
        # Integer-like: provides both __int__ and __index__.
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # Provides only __int__ (no __index__).
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001426
Ethan Furmanfb137212013-08-31 10:18:55 -07001427 def test_formatting_with_enum(self):
1428 # issue18780
1429 import enum
1430 class Float(float, enum.Enum):
1431 PI = 3.1415926
1432 class Int(enum.IntEnum):
1433 IDES = 15
1434 class Str(str, enum.Enum):
1435 ABC = 'abc'
1436 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001437 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1438 'Str.ABC, Str.ABC')
1439 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1440 (Str.ABC, Str.ABC,
1441 Int.IDES, Int.IDES, Int.IDES,
1442 Float.PI, Float.PI),
1443 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001444
1445 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001446 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1447 '...Str.ABC...')
1448 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1449 '...Int.IDES...')
1450 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1451 '...15...')
1452 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1453 '...15...')
1454 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1455 '...15...')
1456 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1457 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001458
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001459 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001460 format_string = "%.{}f".format(sys.maxsize + 1)
1461 with self.assertRaises(ValueError):
1462 result = format_string % 2.34
1463
Martijn Pietersd7e64332017-02-23 13:38:04 +00001464 def test_issue28598_strsubclass_rhs(self):
1465 # A subclass of str with an __rmod__ method should be able to hook
1466 # into the % operator
1467 class SubclassedStr(str):
1468 def __rmod__(self, other):
1469 return 'Success, self.__rmod__({!r}) was called'.format(other)
1470 self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1471 "Success, self.__rmod__('lhs %% %r') was called")
1472
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # %-formatting: a precision one past the C INT_MAX raises ValueError.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        format_string % 2.34
1479
1480 def test_formatting_huge_width(self):
1481 format_string = "%{}f".format(sys.maxsize + 1)
1482 with self.assertRaises(ValueError):
1483 result = format_string % 2.34
1484
Ezio Melottiba42fd52011-04-26 06:09:45 +03001485 def test_startswith_endswith_errors(self):
1486 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001487 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001488 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001489 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001490 self.assertIn('str', exc)
1491 self.assertIn('tuple', exc)
1492
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001497
def test_constructor(self):
    # str(obj) tests (this maps to PyObject_Str() at C level; the
    # original comments still used the Python 2 name "unicode")

    self.assertEqual(
        str('unicode remains unicode'),
        'unicode remains unicode'
    )

    for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
        subclass = StrSubclass(text)
        self.assertEqual(str(subclass), text)
        self.assertEqual(len(subclass), len(text))
        if text == 'ascii':
            self.assertEqual(subclass.encode('ascii'), b'ascii')
            self.assertEqual(subclass.encode('utf-8'), b'ascii')

    self.assertEqual(
        str('strings are converted to unicode'),
        'strings are converted to unicode'
    )

    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        str(StringCompat('__str__ compatible objects are recognized')),
        '__str__ compatible objects are recognized'
    )

    # unicode(obj) is compatible to str():

    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # str() of an already-converted value is unchanged.
    for obj in (123, 123.45):
        self.assertEqual(str(obj), str(str(obj)))

    # str(obj, encoding, errors) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    if not sys.platform.startswith('java'):
        self.assertRaises(
            TypeError,
            str,
            'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        str(b'strings are decoded to unicode', 'utf-8', 'strict'),
        'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            str(
                memoryview(b'character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            'character buffers are decoded to unicode'
        )

    self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001567
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001568 def test_constructor_keyword_args(self):
1569 """Pass various keyword argument combinations to the constructor."""
1570 # The object argument can be passed as a keyword.
1571 self.assertEqual(str(object='foo'), 'foo')
1572 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1573 # The errors argument without encoding triggers "decode" mode.
1574 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1575 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1576
1577 def test_constructor_defaults(self):
1578 """Check the constructor argument defaults."""
1579 # The object argument defaults to '' or b''.
1580 self.assertEqual(str(), '')
1581 self.assertEqual(str(errors='strict'), '')
1582 utf8_cent = '¢'.encode('utf-8')
1583 # The encoding argument defaults to utf-8.
1584 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1585 # The errors argument defaults to strict.
1586 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1587
Walter Dörwald28256f22003-01-19 16:59:20 +00001588 def test_codecs_utf7(self):
1589 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001590 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1591 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1592 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1593 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1594 ('+', b'+-'),
1595 ('+-', b'+--'),
1596 ('+?', b'+-?'),
R David Murray44b548d2016-09-08 13:59:53 -04001597 (r'\?', b'+AFw?'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001598 ('+?', b'+-?'),
1599 (r'\\?', b'+AFwAXA?'),
1600 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001601 (r'++--', b'+-+---'),
1602 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1603 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001604 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001605
Walter Dörwald28256f22003-01-19 16:59:20 +00001606 for (x, y) in utfTests:
1607 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001608
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001609 # Unpaired surrogates are passed through
1610 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1611 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1612 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1613 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1614 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1615 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1616 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1617 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001618
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001619 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1620 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001621
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001622 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001623 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001624
1625 # Direct encoded characters
1626 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1627 # Optional direct characters
1628 set_o = '!"#$%&*;<=>@[]^_`{|}'
1629 for c in set_d:
1630 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1631 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1632 for c in set_o:
1633 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001634
Zackery Spytze349bf22018-08-18 22:43:38 -06001635 with self.assertRaisesRegex(UnicodeDecodeError,
1636 'ill-formed sequence'):
1637 b'+@'.decode('utf-7')
1638
Walter Dörwald28256f22003-01-19 16:59:20 +00001639 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001640 self.assertEqual(''.encode('utf-8'), b'')
1641 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001642 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1643 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001644 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1645 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001646 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1647 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001648 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001649 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1650 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1651 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1652 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1653 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1654 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001655 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1656 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1657 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1658 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1659 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1660 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1661 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1662 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1663 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1664 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001665 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001666
Walter Dörwald28256f22003-01-19 16:59:20 +00001667 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001668 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1669 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1670 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671
Walter Dörwald28256f22003-01-19 16:59:20 +00001672 # Other possible utf-8 test cases:
1673 # * strict decoding testing for all of the
1674 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675
Ezio Melotti57221d02010-07-01 07:32:02 +00001676 def test_utf8_decode_valid_sequences(self):
1677 sequences = [
1678 # single byte
1679 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1680 # 2 bytes
1681 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1682 # 3 bytes
1683 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1684 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1685 # 4 bytes
1686 (b'\xF0\x90\x80\x80', '\U00010000'),
1687 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1688 ]
1689 for seq, res in sequences:
1690 self.assertEqual(seq.decode('utf-8'), res)
1691
1692
1693 def test_utf8_decode_invalid_sequences(self):
1694 # continuation bytes in a sequence of 2, 3, or 4 bytes
1695 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001696 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001697 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001698 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001699 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1700 invalid_start_bytes = (
1701 continuation_bytes + invalid_2B_seq_start_bytes +
1702 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1703 )
1704
1705 for byte in invalid_start_bytes:
1706 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1707
1708 for sb in invalid_2B_seq_start_bytes:
1709 for cb in continuation_bytes:
1710 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1711
1712 for sb in invalid_4B_seq_start_bytes:
1713 for cb1 in continuation_bytes[:3]:
1714 for cb3 in continuation_bytes[:3]:
1715 self.assertRaises(UnicodeDecodeError,
1716 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1717
1718 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1719 self.assertRaises(UnicodeDecodeError,
1720 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1721 self.assertRaises(UnicodeDecodeError,
1722 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1723 # surrogates
1724 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1725 self.assertRaises(UnicodeDecodeError,
1726 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1727 self.assertRaises(UnicodeDecodeError,
1728 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1729 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1730 self.assertRaises(UnicodeDecodeError,
1731 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1732 self.assertRaises(UnicodeDecodeError,
1733 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1734 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1735 self.assertRaises(UnicodeDecodeError,
1736 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1737 self.assertRaises(UnicodeDecodeError,
1738 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1739
1740 def test_issue8271(self):
1741 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1742 # only the start byte and the continuation byte(s) are now considered
1743 # invalid, instead of the number of bytes specified by the start byte.
1744 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1745 # table 3-8, Row 2) for more information about the algorithm used.
1746 FFFD = '\ufffd'
1747 sequences = [
1748 # invalid start bytes
1749 (b'\x80', FFFD), # continuation byte
1750 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1751 (b'\xc0', FFFD),
1752 (b'\xc0\xc0', FFFD*2),
1753 (b'\xc1', FFFD),
1754 (b'\xc1\xc0', FFFD*2),
1755 (b'\xc0\xc1', FFFD*2),
1756 # with start byte of a 2-byte sequence
1757 (b'\xc2', FFFD), # only the start byte
1758 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001759 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001760 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1761 # with start byte of a 3-byte sequence
1762 (b'\xe1', FFFD), # only the start byte
1763 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1764 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1765 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1766 (b'\xe1\x80', FFFD), # only 1 continuation byte
1767 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1768 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1769 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1770 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1771 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1772 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1773 # with start byte of a 4-byte sequence
1774 (b'\xf1', FFFD), # only the start byte
1775 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1776 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1777 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1778 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1779 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1780 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1781 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1782 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1783 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1784 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1785 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1786 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1787 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1788 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1789 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1790 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1791 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1792 # with invalid start byte of a 4-byte sequence (rfc2279)
1793 (b'\xf5', FFFD), # only the start byte
1794 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1795 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1796 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1797 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1798 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1799 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1800 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1801 # with invalid start byte of a 5-byte sequence (rfc2279)
1802 (b'\xf8', FFFD), # only the start byte
1803 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1804 (b'\xf8\x80', FFFD*2), # only one continuation byte
1805 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1806 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1807 # with invalid start byte of a 6-byte sequence (rfc2279)
1808 (b'\xfc', FFFD), # only the start byte
1809 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1810 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1811 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1812 # invalid start byte
1813 (b'\xfe', FFFD),
1814 (b'\xfe\x80\x80', FFFD*3),
1815 # other sequences
1816 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1817 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1818 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1819 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1820 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1821 ]
1822 for n, (seq, res) in enumerate(sequences):
1823 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1824 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1825 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1826 self.assertEqual(seq.decode('utf-8', 'ignore'),
1827 res.replace('\uFFFD', ''))
1828
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001829 def assertCorrectUTF8Decoding(self, seq, res, err):
1830 """
Martin Panter6245cb32016-04-15 02:14:19 +00001831 Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001832 'strict' is used, returns res when 'replace' is used, and that doesn't
1833 return anything when 'ignore' is used.
1834 """
1835 with self.assertRaises(UnicodeDecodeError) as cm:
1836 seq.decode('utf-8')
1837 exc = cm.exception
1838
1839 self.assertIn(err, str(exc))
1840 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1841 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1842 'aaaa' + res + 'bbbb')
1843 res = res.replace('\ufffd', '')
1844 self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1845 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1846 'aaaa' + res + 'bbbb')
1847
1848 def test_invalid_start_byte(self):
1849 """
1850 Test that an 'invalid start byte' error is raised when the first byte
1851 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1852 4-bytes sequence. The invalid start byte is replaced with a single
1853 U+FFFD when errors='replace'.
1854 E.g. <80> is a continuation byte and can appear only after a start byte.
1855 """
1856 FFFD = '\ufffd'
1857 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1858 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1859 'invalid start byte')
1860
1861 def test_unexpected_end_of_data(self):
1862 """
1863 Test that an 'unexpected end of data' error is raised when the string
1864 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1865 enough continuation bytes. The incomplete sequence is replaced with a
1866 single U+FFFD when errors='replace'.
1867 E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1868 sequence, but it's followed by only 2 valid continuation bytes and the
1869 last continuation bytes is missing.
1870 Note: the continuation bytes must be all valid, if one of them is
1871 invalid another error will be raised.
1872 """
1873 sequences = [
1874 'C2', 'DF',
1875 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1876 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1877 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1878 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1879 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1880 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1881 ]
1882 FFFD = '\ufffd'
1883 for seq in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02001884 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001885 'unexpected end of data')
1886
1887 def test_invalid_cb_for_2bytes_seq(self):
1888 """
1889 Test that an 'invalid continuation byte' error is raised when the
1890 continuation byte of a 2-bytes sequence is invalid. The start byte
1891 is replaced by a single U+FFFD and the second byte is handled
1892 separately when errors='replace'.
1893 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1894 sequence, but 41 is not a valid continuation byte because it's the
1895 ASCII letter 'A'.
1896 """
1897 FFFD = '\ufffd'
1898 FFFDx2 = FFFD * 2
1899 sequences = [
1900 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1901 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1902 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1903 ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1904 ]
1905 for seq, res in sequences:
Serhiy Storchaka8cbd3df2016-12-21 12:59:28 +02001906 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001907 'invalid continuation byte')
1908
    def test_invalid_cb_for_3bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 3-bytes sequence are invalid.  When
        errors='replace', if the first continuation byte is valid, the first
        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
        third byte is handled separately, otherwise only the start byte is
        replaced with a U+FFFD and the other continuation bytes are handled
        separately.
        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
        because it's the ASCII letter 'A'.
        Note: when the start byte is E0 or ED, the valid ranges for the first
        continuation byte are limited to A0..BF and 80..9F respectively.
        Python 2 used to consider all the bytes in range 80..BF valid when the
        start byte was ED.  This is fixed in Python 3.
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (hex dump of the invalid sequence, text expected from
        # the 'replace' error handler).
        sequences = [
            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
            ('ED 7F', FFFD+'\x7f'),
            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
        ]
        # The helper checks the 'strict', 'replace' and 'ignore' handlers.
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
1966
    def test_invalid_cb_for_4bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 4-bytes sequence are invalid.  When
        errors='replace', the start byte and all the following valid
        continuation bytes are replaced with a single U+FFFD, and all the bytes
        starting from the first invalid continuation byte (included) are
        handled separately.
        E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
        because it's the ASCII letter 'A'.
        Note: when the start byte is F0 or F4, the valid ranges for the first
        continuation byte are limited to 90..BF and 80..8F respectively; this
        is why <F0 80>, <F0 8F>, <F4 90> and <F4 BF> appear in the table below
        as sequences with an invalid first continuation byte.
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (hex dump of the invalid sequence, text expected from
        # the 'replace' error handler).
        sequences = [
            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
        ]
        # The helper checks the 'strict', 'replace' and 'ignore' handlers.
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
2045
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002046 def test_codecs_idna(self):
2047 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00002048 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002049
Walter Dörwald28256f22003-01-19 16:59:20 +00002050 def test_codecs_errors(self):
2051 # Error handling (encoding)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002052 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2053 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
Walter Dörwald67e83882007-05-05 12:26:27 +00002054 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2055 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
Benjamin Peterson308d6372009-09-18 21:42:35 +00002056 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2057 'Andr\202 x'.encode('ascii', errors='replace'))
2058 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2059 'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060
Walter Dörwald28256f22003-01-19 16:59:20 +00002061 # Error handling (decoding)
Walter Dörwald67e83882007-05-05 12:26:27 +00002062 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2063 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2064 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2065 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03002066 self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002067
Walter Dörwald28256f22003-01-19 16:59:20 +00002068 # Error handling (unknown character names)
Guido van Rossum39478e82007-08-27 17:23:59 +00002069 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002070
Walter Dörwald28256f22003-01-19 16:59:20 +00002071 # Error handling (truncated escape sequence)
Guido van Rossum9c627722007-08-27 18:31:48 +00002072 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002073
Guido van Rossum9c627722007-08-27 18:31:48 +00002074 self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2075 self.assertRaises(TypeError, str, b"hello", "test.unicode2")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002076 self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2077 self.assertRaises(TypeError, "hello".encode, "test.unicode2")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002078
Walter Dörwald28256f22003-01-19 16:59:20 +00002079 # Error handling (wrong arguments)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002080 self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002081
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02002082 # Error handling (lone surrogate in
2083 # _PyUnicode_TransformDecimalAndSpaceToASCII())
2084 self.assertRaises(ValueError, int, "\ud800")
2085 self.assertRaises(ValueError, int, "\udf00")
2086 self.assertRaises(ValueError, float, "\ud800")
2087 self.assertRaises(ValueError, float, "\udf00")
2088 self.assertRaises(ValueError, complex, "\ud800")
2089 self.assertRaises(ValueError, complex, "\udf00")
Guido van Rossum97064862000-04-10 13:52:48 +00002090
Walter Dörwald28256f22003-01-19 16:59:20 +00002091 def test_codecs(self):
2092 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002093 self.assertEqual('hello'.encode('ascii'), b'hello')
2094 self.assertEqual('hello'.encode('utf-7'), b'hello')
2095 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002096 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002097 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2098 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2099 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002100
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002101 # Default encoding is utf-8
2102 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2103
Walter Dörwald28256f22003-01-19 16:59:20 +00002104 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002105 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002106 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002107 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2108 'utf-16-be', 'raw_unicode_escape',
Inada Naoki6a16b182019-03-18 15:44:11 +09002109 'unicode_escape'):
2110 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002111
Walter Dörwald28256f22003-01-19 16:59:20 +00002112 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002113 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002114 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002115 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002116 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002117
Walter Dörwald28256f22003-01-19 16:59:20 +00002118 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002119 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002120 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002121 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002122 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002123
Walter Dörwald28256f22003-01-19 16:59:20 +00002124 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002125 with warnings.catch_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01002126 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2127 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
Inada Naoki6a16b182019-03-18 15:44:11 +09002128 'raw_unicode_escape', 'unicode_escape'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002129 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002130
Antoine Pitrou51f66482011-11-11 13:35:44 +01002131 # UTF-8 must be roundtrip safe for all code points
2132 # (except surrogates, which are forbidden).
2133 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002134 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002135 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002136 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002137
Walter Dörwald28256f22003-01-19 16:59:20 +00002138 def test_codecs_charmap(self):
2139 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002140 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002141 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002142 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002143 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2144 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002145 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002146 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2147 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002148 'iso8859_7', 'iso8859_9',
2149 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002150 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002151
Walter Dörwald28256f22003-01-19 16:59:20 +00002152 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2153 'cp1256', 'cp1257', 'cp1258',
2154 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002155
Walter Dörwald28256f22003-01-19 16:59:20 +00002156 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2157 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002158
Walter Dörwald28256f22003-01-19 16:59:20 +00002159 ### These have undefined mappings:
2160 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002161
Walter Dörwald28256f22003-01-19 16:59:20 +00002162 ### These fail the round-trip:
2163 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002164
Walter Dörwald28256f22003-01-19 16:59:20 +00002165 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002166 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002167
Walter Dörwald28256f22003-01-19 16:59:20 +00002168 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002169 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002170 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002171 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002172 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2173 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002174 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002175 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2176 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002177 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002178 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002179
Walter Dörwald28256f22003-01-19 16:59:20 +00002180 ### These have undefined mappings:
2181 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2182 #'cp1256', 'cp1257', 'cp1258',
2183 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002184 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002185 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002186
Walter Dörwald28256f22003-01-19 16:59:20 +00002187 ### These fail the round-trip:
2188 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002189
Walter Dörwald28256f22003-01-19 16:59:20 +00002190 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002191 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002192
Walter Dörwald28256f22003-01-19 16:59:20 +00002193 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002194 self.assertEqual(("abc" "def"), "abcdef")
2195 self.assertEqual(("abc" "def"), "abcdef")
2196 self.assertEqual(("abc" "def"), "abcdef")
2197 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2198 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002199
Walter Dörwald28256f22003-01-19 16:59:20 +00002200 def test_printing(self):
2201 class BitBucket:
2202 def write(self, text):
2203 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002204
Walter Dörwald28256f22003-01-19 16:59:20 +00002205 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002206 print('abc', file=out)
2207 print('abc', 'def', file=out)
2208 print('abc', 'def', file=out)
2209 print('abc', 'def', file=out)
2210 print('abc\n', file=out)
2211 print('abc\n', end=' ', file=out)
2212 print('abc\n', end=' ', file=out)
2213 print('def\n', file=out)
2214 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002215
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002216 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002217 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002218 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2219 self.assertEqual(x, y)
2220
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002221 y = br'\U00100000'
2222 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2223 self.assertEqual(x, y)
2224 y = br'\U00010000'
2225 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2226 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002227
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002228 try:
2229 br'\U11111111'.decode("raw-unicode-escape")
2230 except UnicodeDecodeError as e:
2231 self.assertEqual(e.start, 0)
2232 self.assertEqual(e.end, 10)
2233 else:
2234 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002235
Brett Cannonc3647ac2005-04-26 03:45:26 +00002236 def test_conversion(self):
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002237 # Make sure __str__() works properly
2238 class ObjectToStr:
Brett Cannonc3647ac2005-04-26 03:45:26 +00002239 def __str__(self):
2240 return "foo"
2241
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002242 class StrSubclassToStr(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002243 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002244 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002245
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002246 class StrSubclassToStrSubclass(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002247 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002248 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002249 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002250 return self
2251
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002252 self.assertEqual(str(ObjectToStr()), "foo")
2253 self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2254 s = str(StrSubclassToStrSubclass("foo"))
2255 self.assertEqual(s, "foofoo")
2256 self.assertIs(type(s), StrSubclassToStrSubclass)
Serhiy Storchaka15095802015-11-25 15:47:01 +02002257 s = StrSubclass(StrSubclassToStrSubclass("foo"))
2258 self.assertEqual(s, "foofoo")
2259 self.assertIs(type(s), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002260
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002261 def test_unicode_repr(self):
2262 class s1:
2263 def __repr__(self):
2264 return '\\n'
2265
2266 class s2:
2267 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002268 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002269
2270 self.assertEqual(repr(s1()), '\\n')
2271 self.assertEqual(repr(s2()), '\\n')
2272
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002273 def test_printable_repr(self):
2274 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002275 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002276
Zachary Ware9fe6d862013-12-08 00:20:35 -06002277 # This test only affects 32-bit platforms because expandtabs can only take
2278 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2279 # to take a 64-bit long, this test should apply to all platforms.
2280 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2281 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002282 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002283 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002284
Victor Stinner1d972ad2011-10-07 13:31:46 +02002285 @support.cpython_only
Antoine Pitroue19aa382011-10-04 16:04:01 +02002286 def test_expandtabs_optimization(self):
2287 s = 'abc'
2288 self.assertIs(s.expandtabs(), s)
2289
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002290 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 if struct.calcsize('P') == 8:
2292 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002293 ascii_struct_size = 48
2294 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 else:
2296 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002297 ascii_struct_size = 24
2298 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299
2300 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2301 code = ord(char)
2302 if code < 0x100:
2303 char_size = 1 # sizeof(Py_UCS1)
2304 struct_size = ascii_struct_size
2305 elif code < 0x10000:
2306 char_size = 2 # sizeof(Py_UCS2)
2307 struct_size = compact_struct_size
2308 else:
2309 char_size = 4 # sizeof(Py_UCS4)
2310 struct_size = compact_struct_size
2311 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002312 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2313 # be allocatable, given enough memory.
2314 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002315 alloc = lambda: char * maxlen
2316 self.assertRaises(MemoryError, alloc)
2317 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002318
Victor Stinner808fc0a2010-03-22 12:50:40 +00002319 def test_format_subclass(self):
2320 class S(str):
2321 def __str__(self):
2322 return '__str__ overridden'
2323 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002324 self.assertEqual("%s" % s, '__str__ overridden')
2325 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002326
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002327 def test_subclass_add(self):
2328 class S(str):
2329 def __add__(self, o):
2330 return "3"
2331 self.assertEqual(S("4") + S("5"), "3")
2332 class S(str):
2333 def __iadd__(self, o):
2334 return "3"
2335 s = S("1")
2336 s += "4"
2337 self.assertEqual(s, "3")
2338
2339 def test_getnewargs(self):
2340 text = 'abc'
2341 args = text.__getnewargs__()
2342 self.assertIsNot(args[0], text)
2343 self.assertEqual(args[0], text)
2344 self.assertEqual(len(args), 1)
2345
@support.cpython_only
def test_resize(self):
    # Growing a string after getargs_u() has been called on it must not
    # leave a stale converted value behind.
    from _testcapi import getargs_u

    for length in range(1, 100, 7):
        # Build the string at run time so that it is a fresh object
        # (refcount=1).
        text = 'a' * length + 'b'

        # First conversion: fills the wstr internal field.
        before = getargs_u(text)
        self.assertEqual(before, text)

        # Resize text: the wstr field must be cleared and then recomputed
        # by the second conversion.
        text += 'c'
        after = getargs_u(text)
        self.assertNotEqual(before, after)
        self.assertEqual(after, text)
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002362
def test_compare(self):
    # Issue #17615: compare strings of every internal kind (ASCII, latin1,
    # BMP, astral) against each other with ==, !=, <, <= and >=.
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)

    # combinations_with_replacement() also yields (s, s) pairs.  Plain
    # combinations() never pairs an item with itself, which left the
    # "if equal:" branch below unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # Compare the two distinct copies (was "copy2 >= copy2", which
            # only compared an object with itself).
            self.assertTrue(copy1 >= copy2)

    # Ordering of an ASCII string against every kind.
    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    # Ordering of a latin1 string against every kind.
    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    # Ordering of a BMP string against every kind.
    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    # Ordering of an astral string against every kind.
    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2436
def test_free_after_iterating(self):
    # Run the shared support check once for forward iteration and once
    # for reversed iteration over str.
    for make_iterator in (iter, reversed):
        support.check_free_after_iterating(self, make_iterator, str)
2440
def test_check_encoding_errors(self):
    # bpo-37388: str(bytes) and str.encode() must check encoding and errors
    # arguments in dev mode
    #
    # The checks run in a child interpreter started with "-X dev".  Each
    # check that wrongly succeeds exits with its own status code (21-26) so
    # that a failure report identifies the offending call; the script exits
    # with 10 when every check raised LookupError as expected.
    encodings = ('ascii', 'utf8', 'latin1')
    invalid = 'Boom, Shaka Laka, Boom!'
    code = textwrap.dedent(f'''
        import sys
        encodings = {encodings!r}

        for data in (b'', b'short string'):
            try:
                str(data, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)

            try:
                str(data, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(22)

            for encoding in encodings:
                try:
                    str(data, encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

        for data in ('', 'short string'):
            try:
                data.encode(encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

            try:
                data.encode(errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(25)

            for encoding in encodings:
                try:
                    data.encode(encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(26)

        sys.exit(10)
    ''')
    proc = assert_python_failure('-X', 'dev', '-c', code)
    # Passing "proc" as the message shows the child's output on failure.
    self.assertEqual(proc.rc, 10, proc)
2500
Serhiy Storchaka63b5b6f2016-10-02 21:16:38 +03002501
class CAPITest(unittest.TestCase):
    # Tests for C-level Unicode functions, reached through ctypes or through
    # helper functions exported by the _testcapi extension module.

    # Test PyUnicode_FromFormat()
    def test_from_format(self):
        support.import_module('ctypes')
        from ctypes import (
            pythonapi, py_object, sizeof,
            c_int, c_long, c_longlong, c_ssize_t,
            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
        name = "PyUnicode_FromFormat"
        _PyUnicode_FromFormat = getattr(pythonapi, name)
        _PyUnicode_FromFormat.restype = py_object

        def PyUnicode_FromFormat(format, *args):
            # str arguments are wrapped in py_object; every other argument
            # (bytes, None, ctypes integers) is passed through unchanged.
            cargs = tuple(
                py_object(arg) if isinstance(arg, str) else arg
                for arg in args)
            return _PyUnicode_FromFormat(format, *cargs)

        def check_format(expected, format, *args):
            text = PyUnicode_FromFormat(format, *args)
            self.assertEqual(expected, text)

        # ascii format, non-ascii argument
        check_format('ascii\x7f=unicode\xe9',
                     b'ascii\x7f=%U', 'unicode\xe9')

        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
        self.assertRaisesRegex(ValueError,
            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
            'string, got a non-ASCII byte: 0xe9$',
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

        # test "%c"
        check_format('\uabcd', b'%c', c_int(0xabcd))
        check_format('\U0010ffff', b'%c', c_int(0x10ffff))
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x110000))
        # Issue #18183
        check_format('\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))

        # test "%"
        check_format('%', b'%')
        check_format('%', b'%%')
        check_format('%s', b'%%s')
        check_format('[%]', b'[%%]')
        check_format('%abc', b'%%%s', b'abc')

        # truncated string
        check_format('abc', b'%.3s', b'abcdef')
        check_format('abc[\ufffd', b'%.5s', 'abc[\u20ac]'.encode('utf8'))
        check_format("'\\u20acABC'", b'%A', '\u20acABC')
        check_format("'\\u20", b'%.5A', '\u20acABCDEF')
        check_format("'\u20acABC'", b'%R', '\u20acABC')
        check_format("'\u20acA", b'%.3R', '\u20acABCDEF')
        check_format('\u20acAB', b'%.3S', '\u20acABCDEF')
        check_format('\u20acAB', b'%.3U', '\u20acABCDEF')
        check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None)
        check_format('abc[\ufffd',
                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

        # The following tests come from #7330.
        # Padded expectations are built with rjust() so that the field width
        # is explicit in the code and does not depend on the exact number of
        # spaces typed inside a string literal.

        # test width modifier and precision modifier with %S
        check_format("repr=" + "abc".rjust(5), b'repr=%5S', 'abc')
        check_format("repr=ab", b'repr=%.2S', 'abc')
        check_format("repr=" + "ab".rjust(5), b'repr=%5.2S', 'abc')

        # test width modifier and precision modifier with %R
        check_format("repr=" + "'abc'".rjust(8), b'repr=%8R', 'abc')
        check_format("repr='ab", b'repr=%.3R', 'abc')
        check_format("repr=" + "'ab".rjust(5), b'repr=%5.3R', 'abc')

        # test width modifier and precision modifier with %A
        check_format("repr=" + "'abc'".rjust(8), b'repr=%8A', 'abc')
        check_format("repr='ab", b'repr=%.3A', 'abc')
        check_format("repr=" + "'ab".rjust(5), b'repr=%5.3A', 'abc')

        # test width modifier and precision modifier with %s
        check_format("repr=" + "abc".rjust(5), b'repr=%5s', b'abc')
        check_format("repr=ab", b'repr=%.2s', b'abc')
        check_format("repr=" + "ab".rjust(5), b'repr=%5.2s', b'abc')

        # test width modifier and precision modifier with %U
        check_format("repr=" + "abc".rjust(5), b'repr=%5U', 'abc')
        check_format("repr=ab", b'repr=%.2U', 'abc')
        check_format("repr=" + "ab".rjust(5), b'repr=%5.2U', 'abc')

        # test width modifier and precision modifier with %V
        check_format("repr=" + "abc".rjust(5), b'repr=%5V', 'abc', b'123')
        check_format("repr=ab", b'repr=%.2V', 'abc', b'123')
        check_format("repr=" + "ab".rjust(5), b'repr=%5.2V', 'abc', b'123')
        check_format("repr=" + "123".rjust(5), b'repr=%5V', None, b'123')
        check_format("repr=12", b'repr=%.2V', None, b'123')
        check_format("repr=" + "12".rjust(5), b'repr=%5.2V', None, b'123')

        # test integer formats (%i, %d, %u)
        check_format('010', b'%03i', c_int(10))
        check_format('0010', b'%0.4i', c_int(10))
        check_format('-123', b'%i', c_int(-123))
        check_format('-123', b'%li', c_long(-123))
        check_format('-123', b'%lli', c_longlong(-123))
        check_format('-123', b'%zi', c_ssize_t(-123))

        check_format('-123', b'%d', c_int(-123))
        check_format('-123', b'%ld', c_long(-123))
        check_format('-123', b'%lld', c_longlong(-123))
        check_format('-123', b'%zd', c_ssize_t(-123))

        check_format('123', b'%u', c_uint(123))
        check_format('123', b'%lu', c_ulong(123))
        check_format('123', b'%llu', c_ulonglong(123))
        check_format('123', b'%zu', c_size_t(123))

        # test long output
        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
        max_longlong = -min_longlong - 1
        check_format(str(min_longlong), b'%lld', c_longlong(min_longlong))
        check_format(str(max_longlong), b'%lld', c_longlong(max_longlong))
        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
        check_format(str(max_ulonglong), b'%llu', c_ulonglong(max_ulonglong))
        # The result of %p is not compared with anything; the call only has
        # to complete.
        PyUnicode_FromFormat(b'%p', c_void_p(-1))

        # test padding (width and/or precision)
        check_format('123'.rjust(10, '0'), b'%010i', c_int(123))
        check_format('123'.rjust(100), b'%100i', c_int(123))
        check_format('123'.rjust(100, '0'), b'%.100i', c_int(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80i', c_int(123))

        check_format('123'.rjust(10, '0'), b'%010u', c_uint(123))
        check_format('123'.rjust(100), b'%100u', c_uint(123))
        check_format('123'.rjust(100, '0'), b'%.100u', c_uint(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80u', c_uint(123))

        check_format('123'.rjust(10, '0'), b'%010x', c_int(0x123))
        check_format('123'.rjust(100), b'%100x', c_int(0x123))
        check_format('123'.rjust(100, '0'), b'%.100x', c_int(0x123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80x', c_int(0x123))

        # test %A
        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

        # test %V
        check_format('repr=abc', b'repr=%V', 'abc', b'xyz')

        # Test string decode from parameter of %s using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
        check_format('repr=\u4eba\u6c11',
                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

        # Test replace error handler.
        check_format('repr=abc\ufffd', b'repr=%V', None, b'abc\xff')

        # not supported: copy the raw format string. these tests are just here
        # to check for crashes and should not be considered as specifications
        check_format('%s', b'%1%s', b'abc')
        check_format('%1abc', b'%1abc')
        check_format('%+i', b'%+i', c_int(10))
        check_format('%.%s', b'%.%s', b'abc')

        # Issue #33817: empty strings
        check_format('', b'')
        check_format('', b'%s', b'')

    # Test PyUnicode_AsWideChar()
    @support.cpython_only
    def test_aswidechar(self):
        from _testcapi import unicode_aswidechar
        support.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        # The helper is called with a string and a buffer length and returns
        # a (wchar, size) pair.
        wchar, size = unicode_aswidechar('abcdef', 2)
        self.assertEqual(size, 2)
        self.assertEqual(wchar, 'ab')

        wchar, size = unicode_aswidechar('abc', 3)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc')

        wchar, size = unicode_aswidechar('abc', 4)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc', 10)
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidechar('abc\0def', 20)
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        # A non-BMP character takes two units when wchar_t is 2 bytes wide
        # and one unit when it is 4 bytes wide.
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            buflen = 3
            nchar = 2
        else:  # sizeof(c_wchar) == 4
            buflen = 2
            nchar = 1
        wchar, size = unicode_aswidechar(nonbmp, buflen)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsWideCharString()
    @support.cpython_only
    def test_aswidecharstring(self):
        from _testcapi import unicode_aswidecharstring
        support.import_module('ctypes')
        from ctypes import c_wchar, sizeof

        wchar, size = unicode_aswidecharstring('abc')
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')

        wchar, size = unicode_aswidecharstring('abc\0def')
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')

        # Expected unit count for a non-BMP character depends on the width
        # of wchar_t.
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            nchar = 2
        else:  # sizeof(c_wchar) == 4
            nchar = 1
        wchar, size = unicode_aswidecharstring(nonbmp)
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')

    # Test PyUnicode_AsUCS4()
    @support.cpython_only
    def test_asucs4(self):
        from _testcapi import unicode_asucs4
        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
                  'a\ud800b\udfffc', '\ud834\udd1e']:
            length = len(s)
            # The expected values below show '\uffff' in every position of
            # the result that was not written by the call.
            self.assertEqual(unicode_asucs4(s, length, 1), s+'\0')
            self.assertEqual(unicode_asucs4(s, length, 0), s+'\uffff')
            self.assertEqual(unicode_asucs4(s, length+1, 1), s+'\0\uffff')
            self.assertEqual(unicode_asucs4(s, length+1, 0), s+'\0\uffff')
            # A length shorter than the string is rejected.
            self.assertRaises(SystemError, unicode_asucs4, s, length-1, 1)
            self.assertRaises(SystemError, unicode_asucs4, s, length-2, 0)
            # Same checks on a string with an embedded null character.
            s = '\0'.join([s, s])
            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')

    # Test PyUnicode_FindChar()
    @support.cpython_only
    def test_findchar(self):
        from _testcapi import unicode_findchar

        # "text" is used instead of "str" so the builtin is not shadowed.
        for text in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
            for i, ch in enumerate(text):
                # Every character is found at its own index, searching
                # with direction 1 as well as with direction -1.
                self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), 1), i)
                self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), -1), i)

        text = "!>_<!"
        # 0x110000 is above the largest code point and is never found.
        self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), 1), -1)
        self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), -1), -1)
        # start < end
        self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, 1), 4)
        self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, -1), 4)
        # start >= end
        self.assertEqual(unicode_findchar(text, ord('!'), 0, 0, 1), -1)
        self.assertEqual(unicode_findchar(text, ord('!'), len(text), 0, 1), -1)
        # negative
        self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, 1), 0)
        self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, -1), 0)

    # Test PyUnicode_CopyCharacters()
    @support.cpython_only
    def test_copycharacters(self):
        from _testcapi import unicode_copycharacters

        # One 5-character string per width, ordered from narrowest to widest.
        strings = [
            'abcde', '\xa1\xa2\xa3\xa4\xa5',
            '\u4f60\u597d\u4e16\u754c\uff01',
            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
        ]

        for idx, from_ in enumerate(strings):
            # wide -> narrow: exceed maxchar limitation
            for to in strings[:idx]:
                self.assertRaises(
                    SystemError,
                    unicode_copycharacters, to, 0, from_, 0, 5
                )
            # same kind
            for from_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, 0, from_, from_start, 5),
                    (from_[from_start:from_start+5].ljust(5, '\0'),
                     5-from_start)
                )
            for to_start in range(5):
                self.assertEqual(
                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
                    (from_[to_start:to_start+5].rjust(5, '\0'),
                     5-to_start)
                )
            # narrow -> wide
            # Tests omitted since this creates invalid strings.

        # Invalid offsets, sizes and argument types.
        s = strings[0]
        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)

    @support.cpython_only
    def test_encode_decimal(self):
        from _testcapi import unicode_encodedecimal
        # ASCII digits are kept, non-ASCII decimal digits are converted to
        # ASCII digits and Unicode spaces become ASCII spaces.
        self.assertEqual(unicode_encodedecimal('123'),
                         b'123')
        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                         b'3.14')
        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         b' 3.14 ')
        # A character that cannot be encoded is an error, with either of
        # the two error handlers tried here.
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
        self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character",
            unicode_encodedecimal, "123\u20ac", "replace")

    @support.cpython_only
    def test_transform_decimal(self):
        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
        # Decimal digits are converted to ASCII digits; every other
        # character (spaces, the euro sign) is left unchanged.
        self.assertEqual(transform_decimal('123'),
                         '123')
        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                         '3.14')
        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         "\N{EM SPACE}3.14\N{EN SPACE}")
        self.assertEqual(transform_decimal('123\u20ac'),
                         '123\u20ac')

    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        # Issue #25709: Problem with string concatenation and utf-8 cache
        from _testcapi import getargs_s_hash
        # One code point per character width.
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2932
class StringModuleTest(unittest.TestCase):
    # Tests for the two helpers exported by the _string module.

    def test_formatter_parser(self):
        def parse(format):
            return [*_string.formatter_parser(format)]

        # (format string, expected list of parsed 4-tuples)
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in cases:
            formatter = parse(template)
            self.assertEqual(formatter, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        def split(name):
            # The helper returns a pair; its second item is materialised
            # into a list so it can be compared.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        # (field name, expected [first, [(flag, name), ...]])
        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2982
2983
# Run every test case of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()