blob: 9ab624e6fc5db6527764f03dca70b0f4e04e0d53 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    int, "test.unicode2" to a pair that returns a tuple of two ints.
    Any other encoding name yields None.
    """
    def returns_int(input, errors="strict"):
        return 42  # not a tuple

    def returns_int_pair(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (returns_int, returns_int, None, None)
    if encoding == "test.unicode2":
        return (returns_int_pair, returns_int_pair, None, None)
    return None


codecs.register(search_function)
35
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    The clone is produced by a UTF-8 encode/decode round trip.  The
    'surrogatepass' error handler is used in both directions so that
    text containing lone surrogates (such as U+DC80) can be cloned as
    well instead of raising UnicodeEncodeError.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.
    """
    encoded = text.encode('utf-8', 'surrogatepass')
    return encoded.decode('utf-8', 'surrogatepass')
45
class StrSubclass(str):
    """Trivial str subclass that adds nothing to the base type."""
48
Brett Cannon226b2302010-03-20 22:22:22 +000049class UnicodeTest(string_tests.CommonTest,
50 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020051 string_tests.MixinStrUnicodeTest,
52 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000053
Guido van Rossumef87d6e2007-05-02 19:09:54 +000054 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000055
def checkequalnofix(self, result, object, methodname, *args):
    """Check ``object.methodname(*args)`` for value and exact type.

    The call result must compare equal to *result* and have exactly
    the same type.  When the method hands back *object* itself, the
    call is repeated on an instance of a local str subclass and the
    result must then be a different object from that instance.
    """
    method = getattr(object, methodname)
    realresult = method(*args)
    self.assertEqual(realresult, result)
    # assertIs reports both operands on failure, which
    # assertTrue(a is b) cannot do.
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(str):
            def __repr__(self):
                return 'usub(%r)' % str.__repr__(self)
        object = usub(object)
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
        self.assertIsNot(object, realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000073
Jeremy Hylton504de6b2003-10-06 05:08:26 +000074 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000075 self.assertEqual('\xff', '\u00ff')
76 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000077 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
78 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
79 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000080 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000081 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000082
Georg Brandl559e5d72008-06-11 18:37:52 +000083 def test_ascii(self):
84 if not sys.platform.startswith('java'):
85 # Test basic sanity of repr()
86 self.assertEqual(ascii('abc'), "'abc'")
87 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
88 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
89 self.assertEqual(ascii('\\c'), "'\\\\c'")
90 self.assertEqual(ascii('\\'), "'\\\\'")
91 self.assertEqual(ascii('\n'), "'\\n'")
92 self.assertEqual(ascii('\r'), "'\\r'")
93 self.assertEqual(ascii('\t'), "'\\t'")
94 self.assertEqual(ascii('\b'), "'\\x08'")
95 self.assertEqual(ascii("'\""), """'\\'"'""")
96 self.assertEqual(ascii("'\""), """'\\'"'""")
97 self.assertEqual(ascii("'"), '''"'"''')
98 self.assertEqual(ascii('"'), """'"'""")
99 latin1repr = (
100 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
101 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
102 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
103 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
104 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
105 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
106 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
107 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
108 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
109 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
110 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
111 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
112 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
113 "\\xfe\\xff'")
114 testrepr = ascii(''.join(map(chr, range(256))))
115 self.assertEqual(testrepr, latin1repr)
116 # Test ascii works on wide unicode escapes without overflow.
117 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
118 ascii("\U00010000" * 39 + "\uffff" * 4096))
119
120 class WrongRepr:
121 def __repr__(self):
122 return b'byte-repr'
123 self.assertRaises(TypeError, ascii, WrongRepr())
124
Walter Dörwald28256f22003-01-19 16:59:20 +0000125 def test_repr(self):
126 if not sys.platform.startswith('java'):
127 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000128 self.assertEqual(repr('abc'), "'abc'")
129 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
130 self.assertEqual(repr('ab\\'), "'ab\\\\'")
131 self.assertEqual(repr('\\c'), "'\\\\c'")
132 self.assertEqual(repr('\\'), "'\\\\'")
133 self.assertEqual(repr('\n'), "'\\n'")
134 self.assertEqual(repr('\r'), "'\\r'")
135 self.assertEqual(repr('\t'), "'\\t'")
136 self.assertEqual(repr('\b'), "'\\x08'")
137 self.assertEqual(repr("'\""), """'\\'"'""")
138 self.assertEqual(repr("'\""), """'\\'"'""")
139 self.assertEqual(repr("'"), '''"'"''')
140 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000141 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000142 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000143 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
144 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
145 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
146 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
147 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000148 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
149 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
150 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
151 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
152 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
153 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
154 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
155 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000156 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000157 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000158 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000159 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
160 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000161
Georg Brandl559e5d72008-06-11 18:37:52 +0000162 class WrongRepr:
163 def __repr__(self):
164 return b'byte-repr'
165 self.assertRaises(TypeError, repr, WrongRepr())
166
Guido van Rossum49d6b072006-08-17 21:11:47 +0000167 def test_iterators(self):
168 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000169 it = "\u1111\u2222\u3333".__iter__()
170 self.assertEqual(next(it), "\u1111")
171 self.assertEqual(next(it), "\u2222")
172 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000173 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000174
def test_count(self):
    """Run the shared count() checks, then cases with start/end arguments
    and with operands of differing character widths."""
    string_tests.CommonTest.test_count(self)

    # check mixed argument types
    # (expected, subject, *count_args); repeated rows are kept as-is
    typed_cases = (
        (3, 'aaa', 'a'),
        (0, 'aaa', 'b'),
        (3, 'aaa', 'a'),
        (0, 'aaa', 'b'),
        (0, 'aaa', 'b'),
        (1, 'aaa', 'a', -1),
        (3, 'aaa', 'a', -10),
        (2, 'aaa', 'a', 0, -1),
        (0, 'aaa', 'a', 0, -10),
    )
    for expected, subject, *count_args in typed_cases:
        self.checkequalnofix(expected, subject, 'count', *count_args)

    # test mixed kinds
    mixed_cases = (
        (10, '\u0102' + 'a' * 10, 'a'),
        (10, '\U00100304' + 'a' * 10, 'a'),
        (10, '\U00100304' + '\u0102' * 10, '\u0102'),
        (0, 'a' * 10, '\u0102'),
        (0, 'a' * 10, '\U00100304'),
        (0, '\u0102' * 10, '\U00100304'),
        (10, '\u0102' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + '\u0102_' * 10, '\u0102_'),
        (0, 'a' * 10, 'a\u0102'),
        (0, 'a' * 10, 'a\U00100304'),
        (0, '\u0102' * 10, '\u0102\U00100304'),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequal(expected, subject, 'count', needle)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000200
def test_find(self):
    """Run the shared find() checks, then single-character searches,
    argument errors and operands of differing character widths."""
    string_tests.CommonTest.test_find(self)

    # test implementation details of the memchr fast path
    bmp_tail = 'a' * 100 + '\u0102'
    astral_tail = 'a' * 100 + '\U00100304'
    fast_path_cases = (
        (100, bmp_tail, '\u0102'),
        (-1, bmp_tail, '\u0201'),
        (-1, bmp_tail, '\u0120'),
        (-1, bmp_tail, '\u0220'),
        (100, astral_tail, '\U00100304'),
        (-1, astral_tail, '\U00100204'),
        (-1, astral_tail, '\U00102004'),
    )
    for expected, subject, needle in fast_path_cases:
        self.checkequal(expected, subject, 'find', needle)

    # check mixed argument types
    for expected, *find_args in ((0, 'abc'), (9, 'abc', 1), (-1, 'def', 4)):
        self.checkequalnofix(expected, 'abcdefghiabc', 'find', *find_args)

    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)

    # test mixed kinds
    mixed_cases = (
        (100, '\u0102' * 100 + 'a', 'a'),
        (100, '\U00100304' * 100 + 'a', 'a'),
        (100, '\U00100304' * 100 + '\u0102', '\u0102'),
        (-1, 'a' * 100, '\u0102'),
        (-1, 'a' * 100, '\U00100304'),
        (-1, '\u0102' * 100, '\U00100304'),
        (100, '\u0102' * 100 + 'a_', 'a_'),
        (100, '\U00100304' * 100 + 'a_', 'a_'),
        (100, '\U00100304' * 100 + '\u0102_', '\u0102_'),
        (-1, 'a' * 100, 'a\u0102'),
        (-1, 'a' * 100, 'a\U00100304'),
        (-1, '\u0102' * 100, '\u0102\U00100304'),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequal(expected, subject, 'find', needle)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000231
def test_rfind(self):
    """Run the shared rfind() checks, then single-character searches and
    operands of differing character widths."""
    string_tests.CommonTest.test_rfind(self)

    # test implementation details of the memrchr fast path
    bmp_head = '\u0102' + 'a' * 100
    astral_head = '\U00100304' + 'a' * 100
    fast_path_cases = (
        (0, bmp_head, '\u0102'),
        (-1, bmp_head, '\u0201'),
        (-1, bmp_head, '\u0120'),
        (-1, bmp_head, '\u0220'),
        (0, astral_head, '\U00100304'),
        (-1, astral_head, '\U00100204'),
        (-1, astral_head, '\U00102004'),
    )
    for expected, subject, needle in fast_path_cases:
        self.checkequal(expected, subject, 'rfind', needle)

    # check mixed argument types (the repeated row is kept as-is)
    for expected, needle in ((9, 'abc'), (12, ''), (12, '')):
        self.checkequalnofix(expected, 'abcdefghiabc', 'rfind', needle)

    # test mixed kinds
    mixed_cases = (
        (0, 'a' + '\u0102' * 100, 'a'),
        (0, 'a' + '\U00100304' * 100, 'a'),
        (0, '\u0102' + '\U00100304' * 100, '\u0102'),
        (-1, 'a' * 100, '\u0102'),
        (-1, 'a' * 100, '\U00100304'),
        (-1, '\u0102' * 100, '\U00100304'),
        (0, '_a' + '\u0102' * 100, '_a'),
        (0, '_a' + '\U00100304' * 100, '_a'),
        (0, '_\u0102' + '\U00100304' * 100, '_\u0102'),
        (-1, 'a' * 100, '\u0102a'),
        (-1, 'a' * 100, '\U00100304a'),
        (-1, '\u0102' * 100, '\U00100304\u0102'),
    )
    for expected, subject, needle in mixed_cases:
        self.checkequal(expected, subject, 'rfind', needle)
Guido van Rossum8b264542000-12-19 02:22:31 +0000259
def test_index(self):
    """Run the shared index() checks, then successful lookups, ValueError
    cases and operands of differing character widths."""
    string_tests.CommonTest.test_index(self)

    hits = ((0, ''), (3, 'def'), (0, 'abc'), (9, 'abc', 1))
    for expected, *index_args in hits:
        self.checkequalnofix(expected, 'abcdefghiabc', 'index', *index_args)

    misses = (
        ('abcdefghiabc', 'hib'),
        ('abcdefghiab', 'abc', 1),
        ('abcdefghi', 'ghi', 8),
        ('abcdefghi', 'ghi', -1),
    )
    for subject, *index_args in misses:
        self.assertRaises(ValueError, subject.index, *index_args)

    # test mixed kinds
    def expect_found(cases):
        for subject, needle in cases:
            self.checkequal(100, subject, 'index', needle)

    def expect_missing(cases):
        for subject, needle in cases:
            self.assertRaises(ValueError, subject.index, needle)

    expect_found((
        ('\u0102' * 100 + 'a', 'a'),
        ('\U00100304' * 100 + 'a', 'a'),
        ('\U00100304' * 100 + '\u0102', '\u0102'),
    ))
    expect_missing((
        ('a' * 100, '\u0102'),
        ('a' * 100, '\U00100304'),
        ('\u0102' * 100, '\U00100304'),
    ))
    expect_found((
        ('\u0102' * 100 + 'a_', 'a_'),
        ('\U00100304' * 100 + 'a_', 'a_'),
        ('\U00100304' * 100 + '\u0102_', '\u0102_'),
    ))
    expect_missing((
        ('a' * 100, 'a\u0102'),
        ('a' * 100, 'a\U00100304'),
        ('\u0102' * 100, '\u0102\U00100304'),
    ))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000283
def test_rindex(self):
    """Run the shared rindex() checks, then successful lookups, ValueError
    cases and operands of differing character widths."""
    string_tests.CommonTest.test_rindex(self)

    hits = ((12, ''), (3, 'def'), (9, 'abc'), (0, 'abc', 0, -1))
    for expected, *rindex_args in hits:
        self.checkequalnofix(expected, 'abcdefghiabc', 'rindex', *rindex_args)

    misses = (
        ('abcdefghiabc', 'hib'),
        ('defghiabc', 'def', 1),
        ('defghiabc', 'abc', 0, -1),
        ('abcdefghi', 'ghi', 0, 8),
        ('abcdefghi', 'ghi', 0, -1),
    )
    for subject, *rindex_args in misses:
        self.assertRaises(ValueError, subject.rindex, *rindex_args)

    # test mixed kinds
    def expect_found(cases):
        for subject, needle in cases:
            self.checkequal(0, subject, 'rindex', needle)

    def expect_missing(cases):
        for subject, needle in cases:
            self.assertRaises(ValueError, subject.rindex, needle)

    expect_found((
        ('a' + '\u0102' * 100, 'a'),
        ('a' + '\U00100304' * 100, 'a'),
        ('\u0102' + '\U00100304' * 100, '\u0102'),
    ))
    expect_missing((
        ('a' * 100, '\u0102'),
        ('a' * 100, '\U00100304'),
        ('\u0102' * 100, '\U00100304'),
    ))
    expect_found((
        ('_a' + '\u0102' * 100, '_a'),
        ('_a' + '\U00100304' * 100, '_a'),
        ('_\u0102' + '\U00100304' * 100, '_\u0102'),
    ))
    expect_missing((
        ('a' * 100, '\u0102a'),
        ('a' * 100, '\U00100304a'),
        ('\u0102' * 100, '\U00100304\u0102'),
    ))
Guido van Rossuma831cac2000-03-10 23:23:21 +0000309
def test_maketrans_translate(self):
    """Check translate() with hand-written and maketrans()-built tables,
    including deletions, multi-character replacements, out-of-range
    code points and argument errors."""
    make_table = self.type2test.maketrans

    # these work with plain translate()
    plain_cases = (
        ('bbbc', 'abababc', {ord('a'): None}),
        ('iiic', 'abababc', {ord('a'): None, ord('b'): ord('i')}),
        ('iiix', 'abababc',
         {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}),
        ('c', 'abababc', {ord('a'): None, ord('b'): ''}),
        ('xyyx', 'xzx', {ord('z'): 'yy'}),
    )
    for expected, subject, table in plain_cases:
        self.checkequalnofix(expected, subject, 'translate', table)

    # this needs maketrans()
    self.checkequalnofix('abababc', 'abababc', 'translate', {'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', 'abababc', 'translate',
                         make_table({'a': None, 'b': '<i>'}))
    # test alternative way of calling maketrans()
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate',
                         make_table('abc', 'xyz', 'd'))

    # Each row: (subject, positional args for str.maketrans, expected).
    str_table_cases = (
        # various tests switching from ASCII to latin1 or the opposite;
        # same length, remove a letter, or replace with a longer string.
        ("[a]", ('a', 'X'), "[X]"),
        ("[a]", ({'a': 'X'},), "[X]"),
        ("[a]", ({'a': None},), "[]"),
        ("[a]", ({'a': 'XXX'},), "[XXX]"),
        ("[a]", ({'a': '\xe9'},), "[\xe9]"),
        ('axb', ({'a': None, 'b': '123'},), "x123"),
        ('axb', ({'a': None, 'b': '\xe9'},), "x\xe9"),
        # test non-ASCII (don't take the fast-path)
        ("[a]", ({'a': '<\xe9>'},), "[<\xe9>]"),
        ("[\xe9]", ({'\xe9': 'a'},), "[a]"),
        ("[\xe9]", ({'\xe9': None},), "[]"),
        ("[\xe9]", ({'\xe9': '123'},), "[123]"),
        ("[a\xe9]", ({'a': '<\u20ac>'},), "[<\u20ac>\xe9]"),
    )
    for subject, table_args, expected in str_table_cases:
        self.assertEqual(subject.translate(str.maketrans(*table_args)),
                         expected)

    # invalid Unicode characters
    invalid_char = 0x10ffff + 1
    for before in "a\xe9\u20ac\U0010ffff":
        mapping = str.maketrans({before: invalid_char})
        text = "[%s]" % before
        self.assertRaises(ValueError, text.translate, mapping)

    # errors
    bad_maketrans_calls = (
        (TypeError, ()),
        (ValueError, ('abc', 'defg')),
        (TypeError, (2, 'def')),
        (TypeError, ('abc', 2)),
        (TypeError, ('abc', 'def', 2)),
        (ValueError, ({'xy': 2},)),
        (TypeError, ({(1,): 2},)),
    )
    for expected_exc, call_args in bad_maketrans_calls:
        self.assertRaises(expected_exc, make_table, *call_args)

    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000379
def test_split(self):
    """Run the shared split() checks, then split on one- and
    two-character delimiters across differing character widths."""
    string_tests.CommonTest.test_split(self)

    # test mixed kinds
    char_pairs = ('ba', '\u0101\u0100', '\U00010301\U00010300')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in char_pairs:
        left, right = pair[0] * 9, pair[1] * 9
        whole = left + right
        for delim in delimiters:
            doubled = delim * 2
            self.checkequal([whole], whole, 'split', delim)
            self.checkequal([left, right],
                            left + delim + right, 'split', delim)
            self.checkequal([whole], whole, 'split', doubled)
            self.checkequal([left, right],
                            left + doubled + right, 'split', doubled)
396
def test_rsplit(self):
    """Run the shared rsplit() checks, then rsplit on one- and
    two-character delimiters across differing character widths."""
    string_tests.CommonTest.test_rsplit(self)

    # test mixed kinds
    char_pairs = ('ba', '\u0101\u0100', '\U00010301\U00010300')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in char_pairs:
        left, right = pair[0] * 9, pair[1] * 9
        whole = left + right
        for delim in delimiters:
            doubled = delim * 2
            self.checkequal([whole], whole, 'rsplit', delim)
            self.checkequal([left, right],
                            left + delim + right, 'rsplit', delim)
            self.checkequal([whole], whole, 'rsplit', doubled)
            self.checkequal([left, right],
                            left + doubled + right, 'rsplit', doubled)
412
def test_partition(self):
    """Run the shared partition() checks, then partition on one- and
    two-character separators across differing character widths."""
    string_tests.MixinStrUnicodeUserStringTest.test_partition(self)

    # test mixed kinds
    self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
    char_pairs = ('ba', '\u0101\u0100', '\U00010301\U00010300')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in char_pairs:
        left, right = pair[0] * 9, pair[1] * 9
        whole = left + right
        for delim in delimiters:
            doubled = delim * 2
            self.checkequal((whole, '', ''), whole, 'partition', delim)
            self.checkequal((left, delim, right),
                            left + delim + right, 'partition', delim)
            self.checkequal((whole, '', ''), whole, 'partition', doubled)
            self.checkequal((left, doubled, right),
                            left + doubled + right, 'partition', doubled)
429
def test_rpartition(self):
    """Run the shared rpartition() checks, then rpartition on one- and
    two-character separators across differing character widths."""
    string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)

    # test mixed kinds
    self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
    char_pairs = ('ba', '\u0101\u0100', '\U00010301\U00010300')
    delimiters = ('c', '\u0102', '\U00010302')
    for pair in char_pairs:
        left, right = pair[0] * 9, pair[1] * 9
        whole = left + right
        for delim in delimiters:
            doubled = delim * 2
            self.checkequal(('', '', whole), whole, 'rpartition', delim)
            self.checkequal((left, delim, right),
                            left + delim + right, 'rpartition', delim)
            self.checkequal(('', '', whole), whole, 'rpartition', doubled)
            self.checkequal((left, doubled, right),
                            left + doubled + right, 'rpartition', doubled)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000446
def test_join(self):
    """Run the shared join() checks, then join lists, tuples and a custom
    sequence, and check that non-str items raise TypeError."""
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    class MyWrapper:
        def __init__(self, sval):
            self.sval = sval

        def __str__(self):
            return self.sval

    # mixed arguments
    # (expected, separator, items); repeated rows are kept as-is
    join_cases = (
        ('a b c d', ' ', ['a', 'b', 'c', 'd']),
        ('abcd', '', ('a', 'b', 'c', 'd')),
        ('w x y z', ' ', string_tests.Sequence('wxyz')),
        ('a b c d', ' ', ['a', 'b', 'c', 'd']),
        ('a b c d', ' ', ['a', 'b', 'c', 'd']),
        ('abcd', '', ('a', 'b', 'c', 'd')),
        ('w x y z', ' ', string_tests.Sequence('wxyz')),
    )
    for expected, separator, items in join_cases:
        self.checkequalnofix(expected, separator, 'join', items)

    rejected_item_lists = (
        ['1', '2', MyWrapper('foo')],
        ['1', '2', '3', bytes()],
        [1, 2, 3],
        ['1', '2', 3],
    )
    for items in rejected_item_lists:
        self.checkraises(TypeError, ' ', 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000466
def test_replace(self):
    """Run the shared replace() checks, then a count-limited replace, a
    non-str replacement, and operands of differing character widths."""
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix('one@two!three!', 'one!two!three!',
                         'replace', '!', '@', 1)
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)

    # test mixed kinds
    char_pairs = ('ba', '\u0101\u0100', '\U00010301\U00010300')
    delimiters = ('c', '\u0102', '\U00010302')
    replacements = ('d', '\u0103', '\U00010303')
    for pair in char_pairs:
        left, right = pair[0] * 9, pair[1] * 9
        whole = left + right
        for delim in delimiters:
            doubled = delim * 2
            for repl in replacements:
                replaced = left + repl + right
                self.checkequal(whole, whole, 'replace', delim, repl)
                self.checkequal(replaced, left + delim + right,
                                'replace', delim, repl)
                self.checkequal(whole, whole, 'replace', doubled, repl)
                self.checkequal(replaced, left + doubled + right,
                                'replace', doubled, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000489
@support.cpython_only
def test_replace_id(self):
    """Replacing a substring with itself must return the same object."""
    haystack = 'abc def'
    needle = 'abc'
    self.assertIs(haystack.replace(needle, needle), haystack)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200495
def test_bytes_comparison(self):
    """A str never compares equal to bytes or bytearray with the same
    ASCII content."""
    with support.check_warnings():
        # Silence BytesWarning for the comparisons below.
        warnings.simplefilter('ignore', BytesWarning)
        for other in (b'abc', bytearray(b'abc')):
            self.assertEqual('abc' == other, False)
            self.assertEqual('abc' != other, True)
Brett Cannon40430012007-10-22 20:24:51 +0000503
Walter Dörwald28256f22003-01-19 16:59:20 +0000504 def test_comparison(self):
505 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000506 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000507 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000508 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000509
510 if 0:
511 # Move these tests to a Unicode collation module test...
512 # Testing UTF-16 code point order comparisons...
513
514 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000515 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000516 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000517 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000518
519 # Non surrogate above surrogate value, fixup required
520 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000521 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000522
523 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000524 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000525 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000526 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000527 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000528 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000529 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000530 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000531 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000532 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000533 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000534 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000535 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000536 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000538 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000539 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000540 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000542 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000543 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000544 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000545 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000546 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000547 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000548 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000549 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000550 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000551 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000552 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000553 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000554 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000555 test_lecmp(s, s2)
556
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000557 test_fixup('\ue000')
558 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000559
560 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000561 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000562
def test_islower(self):
    """Run the inherited islower() checks, then BMP and non-BMP cases."""
    super().test_islower()
    self.checkequalnofix(False, '\u1FFc', 'islower')
    self.assertFalse('\u2167'.islower())
    self.assertTrue('\u2177'.islower())
    # non-BMP, uppercase
    for char in ('\U00010401', '\U00010427'):
        self.assertFalse(char.islower())
    # non-BMP, lowercase
    for char in ('\U00010429', '\U0001044E'):
        self.assertTrue(char.islower())
    # non-BMP, non-cased
    for char in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000577
578 def test_isupper(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000579 super().test_isupper()
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000580 if not sys.platform.startswith('java'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000581 self.checkequalnofix(False, '\u1FFc', 'isupper')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500582 self.assertTrue('\u2167'.isupper())
583 self.assertFalse('\u2177'.isupper())
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300584 # non-BMP, uppercase
585 self.assertTrue('\U00010401'.isupper())
586 self.assertTrue('\U00010427'.isupper())
587 # non-BMP, lowercase
588 self.assertFalse('\U00010429'.isupper())
589 self.assertFalse('\U0001044E'.isupper())
590 # non-BMP, non-cased
591 self.assertFalse('\U0001F40D'.isupper())
592 self.assertFalse('\U0001F46F'.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000593
594 def test_istitle(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000595 super().test_istitle()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000596 self.checkequalnofix(True, '\u1FFc', 'istitle')
597 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000598
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300599 # non-BMP, uppercase + lowercase
600 self.assertTrue('\U00010401\U00010429'.istitle())
601 self.assertTrue('\U00010427\U0001044E'.istitle())
602 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
603 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
604 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
605
Walter Dörwald28256f22003-01-19 16:59:20 +0000606 def test_isspace(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000607 super().test_isspace()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000608 self.checkequalnofix(True, '\u2000', 'isspace')
609 self.checkequalnofix(True, '\u200a', 'isspace')
610 self.checkequalnofix(False, '\u2014', 'isspace')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300611 # apparently there are no non-BMP spaces chars in Unicode 6
612 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
613 '\U0001F40D', '\U0001F46F']:
614 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
615
616 def test_isalnum(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000617 super().test_isalnum()
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300618 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
619 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
620 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000621
622 def test_isalpha(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000623 super().test_isalpha()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000624 self.checkequalnofix(True, '\u1FFc', 'isalpha')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300625 # non-BMP, cased
626 self.assertTrue('\U00010401'.isalpha())
627 self.assertTrue('\U00010427'.isalpha())
628 self.assertTrue('\U00010429'.isalpha())
629 self.assertTrue('\U0001044E'.isalpha())
630 # non-BMP, non-cased
631 self.assertFalse('\U0001F40D'.isalpha())
632 self.assertFalse('\U0001F46F'.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000633
634 def test_isdecimal(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000635 self.checkequalnofix(False, '', 'isdecimal')
636 self.checkequalnofix(False, 'a', 'isdecimal')
637 self.checkequalnofix(True, '0', 'isdecimal')
638 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
639 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
640 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
641 self.checkequalnofix(True, '0123456789', 'isdecimal')
642 self.checkequalnofix(False, '0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000643
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000644 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000645
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300646 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
647 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
648 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
649 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
650 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
651
Walter Dörwald28256f22003-01-19 16:59:20 +0000652 def test_isdigit(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000653 super().test_isdigit()
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000654 self.checkequalnofix(True, '\u2460', 'isdigit')
655 self.checkequalnofix(False, '\xbc', 'isdigit')
656 self.checkequalnofix(True, '\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000657
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300658 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
659 '\U0001F40D', '\U0001F46F', '\U00011065']:
660 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
661 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
662 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
663
Walter Dörwald28256f22003-01-19 16:59:20 +0000664 def test_isnumeric(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000665 self.checkequalnofix(False, '', 'isnumeric')
666 self.checkequalnofix(False, 'a', 'isnumeric')
667 self.checkequalnofix(True, '0', 'isnumeric')
668 self.checkequalnofix(True, '\u2460', 'isnumeric')
669 self.checkequalnofix(True, '\xbc', 'isnumeric')
670 self.checkequalnofix(True, '\u0660', 'isnumeric')
671 self.checkequalnofix(True, '0123456789', 'isnumeric')
672 self.checkequalnofix(False, '0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000673
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000674 self.assertRaises(TypeError, "abc".isnumeric, 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000675
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300676 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
677 '\U0001F40D', '\U0001F46F']:
678 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
679 for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
680 '\U000104A0', '\U0001F107']:
681 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
682
Martin v. Löwis47383402007-08-15 07:32:56 +0000683 def test_isidentifier(self):
684 self.assertTrue("a".isidentifier())
685 self.assertTrue("Z".isidentifier())
686 self.assertTrue("_".isidentifier())
687 self.assertTrue("b0".isidentifier())
688 self.assertTrue("bc".isidentifier())
689 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000690 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500691 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000692
693 self.assertFalse(" ".isidentifier())
694 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000695 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000696 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000697
Georg Brandl559e5d72008-06-11 18:37:52 +0000698 def test_isprintable(self):
699 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000700 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000701 self.assertTrue("abcdefg".isprintable())
702 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000703 # some defined Unicode character
704 self.assertTrue("\u0374".isprintable())
705 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000706 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000707 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000708 self.assertFalse("\ud800".isprintable())
709
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300710 self.assertTrue('\U0001F46F'.isprintable())
711 self.assertFalse('\U000E0020'.isprintable())
712
713 def test_surrogates(self):
714 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
715 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
716 self.assertTrue(s.islower())
717 self.assertFalse(s.isupper())
718 self.assertFalse(s.istitle())
719 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
720 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
721 self.assertFalse(s.islower())
722 self.assertTrue(s.isupper())
723 self.assertTrue(s.istitle())
724
725 for meth_name in ('islower', 'isupper', 'istitle'):
726 meth = getattr(str, meth_name)
727 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
728 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
729
730 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
731 'isdecimal', 'isnumeric',
732 'isidentifier', 'isprintable'):
733 meth = getattr(str, meth_name)
734 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
735 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
736 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
737 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
738
739
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300740 def test_lower(self):
741 string_tests.CommonTest.test_lower(self)
742 self.assertEqual('\U00010427'.lower(), '\U0001044F')
743 self.assertEqual('\U00010427\U00010427'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300744 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300745 self.assertEqual('\U00010427\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300746 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300747 self.assertEqual('X\U00010427x\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300748 'x\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500749 self.assertEqual('fi'.lower(), 'fi')
750 self.assertEqual('\u0130'.lower(), '\u0069\u0307')
751 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
752 self.assertEqual('\u03a3'.lower(), '\u03c3')
753 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
754 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
755 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
756 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
757 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
758 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
759 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
760 self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300761
Benjamin Petersond5890c82012-01-14 13:23:30 -0500762 def test_casefold(self):
763 self.assertEqual('hello'.casefold(), 'hello')
764 self.assertEqual('hELlo'.casefold(), 'hello')
765 self.assertEqual('ß'.casefold(), 'ss')
766 self.assertEqual('fi'.casefold(), 'fi')
767 self.assertEqual('\u03a3'.casefold(), '\u03c3')
768 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700769 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500770
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300771 def test_upper(self):
772 string_tests.CommonTest.test_upper(self)
773 self.assertEqual('\U0001044F'.upper(), '\U00010427')
774 self.assertEqual('\U0001044F\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300775 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300776 self.assertEqual('\U00010427\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300777 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300778 self.assertEqual('X\U00010427x\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300779 'X\U00010427X\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500780 self.assertEqual('fi'.upper(), 'FI')
781 self.assertEqual('\u0130'.upper(), '\u0130')
782 self.assertEqual('\u03a3'.upper(), '\u03a3')
783 self.assertEqual('ß'.upper(), 'SS')
784 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
785 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
786 self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300787
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300788 def test_capitalize(self):
789 string_tests.CommonTest.test_capitalize(self)
790 self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
791 self.assertEqual('\U0001044F\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300792 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300793 self.assertEqual('\U00010427\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300794 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300795 self.assertEqual('\U0001044F\U00010427'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300796 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300797 self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300798 'X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500799 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
800 exp = '\u0399\u0308\u0300\u0069\u0307'
801 self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
802 self.assertEqual('finnish'.capitalize(), 'FInnish')
803 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300804
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300805 def test_title(self):
Martin Panter152a19c2016-04-06 06:37:17 +0000806 super().test_title()
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300807 self.assertEqual('\U0001044F'.title(), '\U00010427')
808 self.assertEqual('\U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300809 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300810 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300811 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300812 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300813 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300814 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300815 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300816 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300817 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500818 self.assertEqual('fiNNISH'.title(), 'Finnish')
819 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
820 self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300821
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300822 def test_swapcase(self):
823 string_tests.CommonTest.test_swapcase(self)
824 self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
825 self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
826 self.assertEqual('\U0001044F\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300827 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300828 self.assertEqual('\U00010427\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300829 '\U0001044F\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300830 self.assertEqual('\U0001044F\U00010427'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300831 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300832 self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300833 'x\U0001044FX\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500834 self.assertEqual('fi'.swapcase(), 'FI')
835 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
836 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
837 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
838 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
839 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
840 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
841 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
842 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
843 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
844 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
845 self.assertEqual('ß'.swapcase(), 'SS')
846 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300847
Ezio Melottif84e01d2013-07-08 17:48:29 +0200848 def test_center(self):
849 string_tests.CommonTest.test_center(self)
850 self.assertEqual('x'.center(2, '\U0010FFFF'),
851 'x\U0010FFFF')
852 self.assertEqual('x'.center(3, '\U0010FFFF'),
853 '\U0010FFFFx\U0010FFFF')
854 self.assertEqual('x'.center(4, '\U0010FFFF'),
855 '\U0010FFFFx\U0010FFFF\U0010FFFF')
856
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400857 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
Benjamin Peterson4d856892014-10-15 13:39:46 -0400858 @support.cpython_only
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400859 def test_case_operation_overflow(self):
860 # Issue #22643
Serhiy Storchaka411dfd82015-11-07 16:54:48 +0200861 size = 2**32//12 + 1
862 try:
863 s = "ü" * size
864 except MemoryError:
865 self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
866 try:
867 self.assertRaises(OverflowError, s.upper)
868 finally:
869 del s
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400870
Walter Dörwald28256f22003-01-19 16:59:20 +0000871 def test_contains(self):
872 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000873 self.assertIn('a', 'abdb')
874 self.assertIn('a', 'bdab')
875 self.assertIn('a', 'bdaba')
876 self.assertIn('a', 'bdba')
877 self.assertNotIn('a', 'bdb')
878 self.assertIn('a', 'bdba')
879 self.assertIn('a', ('a',1,None))
880 self.assertIn('a', (1,None,'a'))
881 self.assertIn('a', ('a',1,None))
882 self.assertIn('a', (1,None,'a'))
883 self.assertNotIn('a', ('x',1,'y'))
884 self.assertNotIn('a', ('x',1,None))
885 self.assertNotIn('abcd', 'abcxxxx')
886 self.assertIn('ab', 'abcd')
887 self.assertIn('ab', 'abc')
888 self.assertIn('ab', (1,None,'ab'))
889 self.assertIn('', 'abc')
890 self.assertIn('', '')
891 self.assertIn('', 'abc')
892 self.assertNotIn('\0', 'abc')
893 self.assertIn('\0', '\0abc')
894 self.assertIn('\0', 'abc\0')
895 self.assertIn('a', '\0abc')
896 self.assertIn('asdf', 'asdf')
897 self.assertNotIn('asdf', 'asd')
898 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000899
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000900 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200901 # test mixed kinds
902 for fill in ('a', '\u0100', '\U00010300'):
903 fill *= 9
904 for delim in ('c', '\u0102', '\U00010302'):
905 self.assertNotIn(delim, fill)
906 self.assertIn(delim, fill + delim)
907 self.assertNotIn(delim * 2, fill)
908 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000909
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300910 def test_issue18183(self):
911 '\U00010000\U00100000'.lower()
912 '\U00010000\U00100000'.casefold()
913 '\U00010000\U00100000'.upper()
914 '\U00010000\U00100000'.capitalize()
915 '\U00010000\U00100000'.title()
916 '\U00010000\U00100000'.swapcase()
917 '\U00100000'.center(3, '\U00010000')
918 '\U00100000'.ljust(3, '\U00010000')
919 '\U00100000'.rjust(3, '\U00010000')
920
Eric Smith8c663262007-08-25 02:26:07 +0000921 def test_format(self):
922 self.assertEqual(''.format(), '')
923 self.assertEqual('a'.format(), 'a')
924 self.assertEqual('ab'.format(), 'ab')
925 self.assertEqual('a{{'.format(), 'a{')
926 self.assertEqual('a}}'.format(), 'a}')
927 self.assertEqual('{{b'.format(), '{b')
928 self.assertEqual('}}b'.format(), '}b')
929 self.assertEqual('a{{b'.format(), 'a{b')
930
931 # examples from the PEP:
932 import datetime
933 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
934 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
935 "My name is Fred")
936 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
937 "My name is Fred :-{}")
938
939 d = datetime.date(2007, 8, 18)
940 self.assertEqual("The year is {0.year}".format(d),
941 "The year is 2007")
942
Eric Smith8c663262007-08-25 02:26:07 +0000943 # classes we'll use for testing
944 class C:
945 def __init__(self, x=100):
946 self._x = x
947 def __format__(self, spec):
948 return spec
949
950 class D:
951 def __init__(self, x):
952 self.x = x
953 def __format__(self, spec):
954 return str(self.x)
955
956 # class with __str__, but no __format__
957 class E:
958 def __init__(self, x):
959 self.x = x
960 def __str__(self):
961 return 'E(' + self.x + ')'
962
963 # class with __repr__, but no __format__ or __str__
964 class F:
965 def __init__(self, x):
966 self.x = x
967 def __repr__(self):
968 return 'F(' + self.x + ')'
969
970 # class with __format__ that forwards to string, for some format_spec's
971 class G:
972 def __init__(self, x):
973 self.x = x
974 def __str__(self):
975 return "string is " + self.x
976 def __format__(self, format_spec):
977 if format_spec == 'd':
978 return 'G(' + self.x + ')'
979 return object.__format__(self, format_spec)
980
Eric Smith739e2ad2007-08-27 19:07:22 +0000981 class I(datetime.date):
982 def __format__(self, format_spec):
983 return self.strftime(format_spec)
984
Eric Smith185e30c2007-08-30 22:23:08 +0000985 class J(int):
986 def __format__(self, format_spec):
987 return int.__format__(self * 2, format_spec)
988
Guido van Rossum97c1adf2016-08-18 09:22:23 -0700989 class M:
990 def __init__(self, x):
991 self.x = x
992 def __repr__(self):
993 return 'M(' + self.x + ')'
994 __str__ = None
995
996 class N:
997 def __init__(self, x):
998 self.x = x
999 def __repr__(self):
1000 return 'N(' + self.x + ')'
1001 __format__ = None
Eric Smith8c663262007-08-25 02:26:07 +00001002
1003 self.assertEqual(''.format(), '')
1004 self.assertEqual('abc'.format(), 'abc')
1005 self.assertEqual('{0}'.format('abc'), 'abc')
1006 self.assertEqual('{0:}'.format('abc'), 'abc')
1007# self.assertEqual('{ 0 }'.format('abc'), 'abc')
1008 self.assertEqual('X{0}'.format('abc'), 'Xabc')
1009 self.assertEqual('{0}X'.format('abc'), 'abcX')
1010 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1011 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1012 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1013 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1014 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1015 self.assertEqual('{0}'.format(-15), '-15')
1016 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1017 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1018 self.assertEqual('{{'.format(), '{')
1019 self.assertEqual('}}'.format(), '}')
1020 self.assertEqual('{{}}'.format(), '{}')
1021 self.assertEqual('{{x}}'.format(), '{x}')
1022 self.assertEqual('{{{0}}}'.format(123), '{123}')
1023 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1024 self.assertEqual('}}{{'.format(), '}{')
1025 self.assertEqual('}}x{{'.format(), '}x{')
1026
Eric Smith7ade6482007-08-26 22:27:13 +00001027 # weird field names
1028 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1029 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001030 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001031
Eric Smith8c663262007-08-25 02:26:07 +00001032 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1033 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1034 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1035 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1036 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1037 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1038 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1039
Eric Smith8c663262007-08-25 02:26:07 +00001040 # strings
1041 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1042 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1043 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1044 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1045 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1046 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1047 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1048 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1049 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1050 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1051 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1052 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1053 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1054 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1055 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1056 self.assertEqual('{0:>7s}'.format('result'), ' result')
1057 self.assertEqual('{0:>8s}'.format('result'), ' result')
1058 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1059 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1060 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1061 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1062 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1063 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1064
Eric V. Smith2ea97122014-04-14 11:55:10 -04001065 # issue 12546: use \x00 as a fill character
1066 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1067 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1068 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1069 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1070
1071 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1072 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1073 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1074 self.assertEqual('{0:<6}'.format(3), '3 ')
1075
1076 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1077 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1078 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1079 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1080
1081 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1082 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1083 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1084 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1085
Eric Smith8c663262007-08-25 02:26:07 +00001086 # format specifiers for user defined type
1087 self.assertEqual('{0:abc}'.format(C()), 'abc')
1088
Georg Brandld52429f2008-07-04 15:55:02 +00001089 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001090 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1091 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1092 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1093 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1094 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1095 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1096 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001097 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001098 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1099 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001100 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001101 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001102 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001103 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1104 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001105 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001106
Eric Smith8c663262007-08-25 02:26:07 +00001107 # test fallback to object.__format__
1108 self.assertEqual('{0}'.format({}), '{}')
1109 self.assertEqual('{0}'.format([]), '[]')
1110 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001111
Eric Smith8c663262007-08-25 02:26:07 +00001112 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001113 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1114
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001115 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1116 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1117 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001118
Eric Smith739e2ad2007-08-27 19:07:22 +00001119 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1120 month=8,
1121 day=27)),
1122 "date: 2007-08-27")
1123
Eric Smith185e30c2007-08-30 22:23:08 +00001124 # test deriving from a builtin type and overriding __format__
1125 self.assertEqual("{0}".format(J(10)), "20")
1126
1127
Eric Smith8c663262007-08-25 02:26:07 +00001128 # string format specifiers
1129 self.assertEqual('{0:}'.format('a'), 'a')
1130
1131 # computed format specifiers
1132 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1133 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1134 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1135 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1136 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1137
1138 # test various errors
1139 self.assertRaises(ValueError, '{'.format)
1140 self.assertRaises(ValueError, '}'.format)
1141 self.assertRaises(ValueError, 'a{'.format)
1142 self.assertRaises(ValueError, 'a}'.format)
1143 self.assertRaises(ValueError, '{a'.format)
1144 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001145 self.assertRaises(IndexError, '{0}'.format)
1146 self.assertRaises(IndexError, '{1}'.format, 'abc')
1147 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001148 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001149 self.assertRaises(ValueError, "abc{0:{}".format)
1150 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001151 self.assertRaises(IndexError, "{0.}".format)
1152 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001153 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001154 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001155 self.assertRaises(KeyError, "{0]}".format)
1156 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001157 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001158 self.assertRaises(ValueError, "{0[0}".format, 0)
1159 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1160 self.assertRaises(KeyError, "{c]}".format)
1161 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1162 self.assertRaises(ValueError, "{0}}".format, 0)
1163 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001164 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001165 self.assertRaises(ValueError, "{0!}".format, 0)
1166 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001167 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001168 self.assertRaises(IndexError, "{:}".format)
1169 self.assertRaises(IndexError, "{:s}".format)
1170 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001171 big = "23098475029384702983476098230754973209482573"
1172 self.assertRaises(ValueError, ("{" + big + "}").format)
1173 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001174
Eric Smith41669ca2009-05-23 14:23:22 +00001175 # issue 6089
1176 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1177 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1178
Eric Smith8c663262007-08-25 02:26:07 +00001179 # can't have a replacement on the field name portion
1180 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1181
1182 # exceed maximum recursion depth
1183 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1184 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1185 0, 1, 2, 3, 4, 5, 6, 7)
1186
1187 # string format spec errors
1188 self.assertRaises(ValueError, "{0:-s}".format, '')
1189 self.assertRaises(ValueError, format, "", "-")
1190 self.assertRaises(ValueError, "{0:=s}".format, '')
1191
Eric Smithb1ebcc62008-07-15 13:02:41 +00001192 # Alternate formatting is not supported
1193 self.assertRaises(ValueError, format, '', '#')
1194 self.assertRaises(ValueError, format, '', '#20')
1195
Victor Stinnerece58de2012-04-23 23:36:38 +02001196 # Non-ASCII
1197 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1198 'ABC\u0410\u0411\u0412')
1199 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1200 'ABC')
1201 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1202 '')
1203
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001204 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001205 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1206 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1207 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1208 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1209 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1210 self.assertRaises(ValueError, "{a{}b}".format, 42)
1211 self.assertRaises(ValueError, "{a{b}".format, 42)
1212 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001213
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001214 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001215
Guido van Rossum97c1adf2016-08-18 09:22:23 -07001216 # Blocking fallback
1217 m = M('data')
1218 self.assertEqual("{!r}".format(m), 'M(data)')
1219 self.assertRaises(TypeError, "{!s}".format, m)
1220 self.assertRaises(TypeError, "{}".format, m)
1221 n = N('data')
1222 self.assertEqual("{!r}".format(n), 'N(data)')
1223 self.assertEqual("{!s}".format(n), 'N(data)')
1224 self.assertRaises(TypeError, "{}".format, n)
1225
Eric Smith27bbca62010-11-04 17:06:58 +00001226 def test_format_map(self):
1227 self.assertEqual(''.format_map({}), '')
1228 self.assertEqual('a'.format_map({}), 'a')
1229 self.assertEqual('ab'.format_map({}), 'ab')
1230 self.assertEqual('a{{'.format_map({}), 'a{')
1231 self.assertEqual('a}}'.format_map({}), 'a}')
1232 self.assertEqual('{{b'.format_map({}), '{b')
1233 self.assertEqual('}}b'.format_map({}), '}b')
1234 self.assertEqual('a{{b'.format_map({}), 'a{b')
1235
1236 # using mappings
1237 class Mapping(dict):
1238 def __missing__(self, key):
1239 return key
1240 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1241 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1242
1243 class InternalMapping:
1244 def __init__(self):
1245 self.mapping = {'a': 'hello'}
1246 def __getitem__(self, key):
1247 return self.mapping[key]
1248 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1249
1250
Eric Smith27bbca62010-11-04 17:06:58 +00001251 class C:
1252 def __init__(self, x=100):
1253 self._x = x
1254 def __format__(self, spec):
1255 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001256 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1257
1258 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001259 self.assertRaises(TypeError, ''.format_map)
1260 self.assertRaises(TypeError, 'a'.format_map)
1261
1262 self.assertRaises(ValueError, '{'.format_map, {})
1263 self.assertRaises(ValueError, '}'.format_map, {})
1264 self.assertRaises(ValueError, 'a{'.format_map, {})
1265 self.assertRaises(ValueError, 'a}'.format_map, {})
1266 self.assertRaises(ValueError, '{a'.format_map, {})
1267 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001268
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001269 # issue #12579: can't supply positional params to format_map
1270 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1271 self.assertRaises(ValueError, '{}'.format_map, 'a')
1272 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1273
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001274 def test_format_huge_precision(self):
1275 format_string = ".{}f".format(sys.maxsize + 1)
1276 with self.assertRaises(ValueError):
1277 result = format(2.34, format_string)
1278
1279 def test_format_huge_width(self):
1280 format_string = "{}f".format(sys.maxsize + 1)
1281 with self.assertRaises(ValueError):
1282 result = format(2.34, format_string)
1283
1284 def test_format_huge_item_number(self):
1285 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1286 with self.assertRaises(ValueError):
1287 result = format_string.format(2.34)
1288
Eric Smith8ec90442009-03-14 12:29:34 +00001289 def test_format_auto_numbering(self):
1290 class C:
1291 def __init__(self, x=100):
1292 self._x = x
1293 def __format__(self, spec):
1294 return spec
1295
1296 self.assertEqual('{}'.format(10), '10')
1297 self.assertEqual('{:5}'.format('s'), 's ')
1298 self.assertEqual('{!r}'.format('s'), "'s'")
1299 self.assertEqual('{._x}'.format(C(10)), '10')
1300 self.assertEqual('{[1]}'.format([1, 2]), '2')
1301 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1302 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1303
1304 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1305 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1306
1307 # can't mix and match numbering and auto-numbering
1308 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1309 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1310 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1311 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1312
1313 # can mix and match auto-numbering and named
1314 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1315 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1316 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1317 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1318
def test_formatting(self):
    # printf-style ('%') formatting of str.  Runs the shared string_tests
    # checks first, then str-specific cases: non-ASCII data, '%c',
    # '*' widths and precisions, nan/inf, and integer-like objects.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # '%5.2f' pads to five columns, hence the two spaces before '3.xx'.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    # NOTE(review): the extent of this guard was reconstructed; it is
    # taken to cover the three repr/ascii assertions -- confirm.
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; a negative
    # width left-justifies.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    #issue 19995
    class PseudoInt:
        # Integer-like: provides both __int__ and __index__.
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # Provides __int__ only, so it is not accepted as an integer.
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001405
Ethan Furmanfb137212013-08-31 10:18:55 -07001406 def test_formatting_with_enum(self):
1407 # issue18780
1408 import enum
1409 class Float(float, enum.Enum):
1410 PI = 3.1415926
1411 class Int(enum.IntEnum):
1412 IDES = 15
1413 class Str(str, enum.Enum):
1414 ABC = 'abc'
1415 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001416 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1417 'Str.ABC, Str.ABC')
1418 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1419 (Str.ABC, Str.ABC,
1420 Int.IDES, Int.IDES, Int.IDES,
1421 Float.PI, Float.PI),
1422 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001423
1424 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001425 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1426 '...Str.ABC...')
1427 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1428 '...Int.IDES...')
1429 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1430 '...15...')
1431 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1432 '...15...')
1433 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1434 '...15...')
1435 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1436 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001437
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001438 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001439 format_string = "%.{}f".format(sys.maxsize + 1)
1440 with self.assertRaises(ValueError):
1441 result = format_string % 2.34
1442
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # '%' formatting: a precision one past INT_MAX (as reported by
    # _testcapi) must be rejected with ValueError.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        format_string % 2.34
1449
1450 def test_formatting_huge_width(self):
1451 format_string = "%{}f".format(sys.maxsize + 1)
1452 with self.assertRaises(ValueError):
1453 result = format_string % 2.34
1454
Ezio Melottiba42fd52011-04-26 06:09:45 +03001455 def test_startswith_endswith_errors(self):
1456 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001457 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001458 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001459 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001460 self.assertIn('str', exc)
1461 self.assertIn('tuple', exc)
1462
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001467
def test_constructor(self):
    # str() constructor: one-argument form (str, str subclasses, objects
    # with __str__, numbers) and the decoding form
    # str(obj, encoding, errors).

    # str(obj) tests
    self.assertEqual(
        str('unicode remains unicode'),
        'unicode remains unicode'
    )

    # Covers ASCII, Latin-1, BMP and non-BMP text.
    for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
        subclass = StrSubclass(text)
        self.assertEqual(str(subclass), text)
        self.assertEqual(len(subclass), len(text))
        if text == 'ascii':
            self.assertEqual(subclass.encode('ascii'), b'ascii')
            self.assertEqual(subclass.encode('utf-8'), b'ascii')

    self.assertEqual(
        str('strings are converted to unicode'),
        'strings are converted to unicode'
    )

    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        str(StringCompat('__str__ compatible objects are recognized')),
        '__str__ compatible objects are recognized'
    )

    # str(obj) goes through obj.__str__():
    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # str() of an already-converted value is unchanged.
    for obj in (123, 123.45):
        self.assertEqual(str(obj), str(str(obj)))

    # str(obj, encoding, errors) tests

    if not sys.platform.startswith('java'):
        # A str cannot itself be decoded.
        self.assertRaises(
            TypeError,
            str,
            'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        str(b'strings are decoded to unicode', 'utf-8', 'strict'),
        'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            str(
                memoryview(b'character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            'character buffers are decoded to unicode'
        )

    self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001537
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001538 def test_constructor_keyword_args(self):
1539 """Pass various keyword argument combinations to the constructor."""
1540 # The object argument can be passed as a keyword.
1541 self.assertEqual(str(object='foo'), 'foo')
1542 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1543 # The errors argument without encoding triggers "decode" mode.
1544 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1545 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1546
1547 def test_constructor_defaults(self):
1548 """Check the constructor argument defaults."""
1549 # The object argument defaults to '' or b''.
1550 self.assertEqual(str(), '')
1551 self.assertEqual(str(errors='strict'), '')
1552 utf8_cent = '¢'.encode('utf-8')
1553 # The encoding argument defaults to utf-8.
1554 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1555 # The errors argument defaults to strict.
1556 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1557
Walter Dörwald28256f22003-01-19 16:59:20 +00001558 def test_codecs_utf7(self):
1559 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001560 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1561 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1562 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1563 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1564 ('+', b'+-'),
1565 ('+-', b'+--'),
1566 ('+?', b'+-?'),
R David Murray44b548d2016-09-08 13:59:53 -04001567 (r'\?', b'+AFw?'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001568 ('+?', b'+-?'),
1569 (r'\\?', b'+AFwAXA?'),
1570 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001571 (r'++--', b'+-+---'),
1572 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1573 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001574 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001575
Walter Dörwald28256f22003-01-19 16:59:20 +00001576 for (x, y) in utfTests:
1577 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001578
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001579 # Unpaired surrogates are passed through
1580 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1581 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1582 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1583 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1584 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1585 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1586 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1587 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001588
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001589 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1590 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001591
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001592 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001593 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001594
1595 # Direct encoded characters
1596 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1597 # Optional direct characters
1598 set_o = '!"#$%&*;<=>@[]^_`{|}'
1599 for c in set_d:
1600 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1601 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1602 for c in set_o:
1603 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001604
Walter Dörwald28256f22003-01-19 16:59:20 +00001605 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001606 self.assertEqual(''.encode('utf-8'), b'')
1607 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001608 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1609 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001610 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1611 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001612 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1613 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001614 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001615 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1616 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1617 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1618 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1619 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1620 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001621 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1622 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1623 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1624 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1625 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1626 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1627 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1628 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1629 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1630 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001631 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001632
Walter Dörwald28256f22003-01-19 16:59:20 +00001633 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001634 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1635 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1636 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637
Walter Dörwald28256f22003-01-19 16:59:20 +00001638 # Other possible utf-8 test cases:
1639 # * strict decoding testing for all of the
1640 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641
Ezio Melotti57221d02010-07-01 07:32:02 +00001642 def test_utf8_decode_valid_sequences(self):
1643 sequences = [
1644 # single byte
1645 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1646 # 2 bytes
1647 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1648 # 3 bytes
1649 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1650 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1651 # 4 bytes
1652 (b'\xF0\x90\x80\x80', '\U00010000'),
1653 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1654 ]
1655 for seq, res in sequences:
1656 self.assertEqual(seq.decode('utf-8'), res)
1657
1658
1659 def test_utf8_decode_invalid_sequences(self):
1660 # continuation bytes in a sequence of 2, 3, or 4 bytes
1661 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001662 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001663 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001664 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001665 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1666 invalid_start_bytes = (
1667 continuation_bytes + invalid_2B_seq_start_bytes +
1668 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1669 )
1670
1671 for byte in invalid_start_bytes:
1672 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1673
1674 for sb in invalid_2B_seq_start_bytes:
1675 for cb in continuation_bytes:
1676 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1677
1678 for sb in invalid_4B_seq_start_bytes:
1679 for cb1 in continuation_bytes[:3]:
1680 for cb3 in continuation_bytes[:3]:
1681 self.assertRaises(UnicodeDecodeError,
1682 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1683
1684 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1685 self.assertRaises(UnicodeDecodeError,
1686 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1687 self.assertRaises(UnicodeDecodeError,
1688 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1689 # surrogates
1690 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1691 self.assertRaises(UnicodeDecodeError,
1692 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1693 self.assertRaises(UnicodeDecodeError,
1694 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1695 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1696 self.assertRaises(UnicodeDecodeError,
1697 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1698 self.assertRaises(UnicodeDecodeError,
1699 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1700 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1701 self.assertRaises(UnicodeDecodeError,
1702 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1703 self.assertRaises(UnicodeDecodeError,
1704 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1705
1706 def test_issue8271(self):
1707 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1708 # only the start byte and the continuation byte(s) are now considered
1709 # invalid, instead of the number of bytes specified by the start byte.
1710 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1711 # table 3-8, Row 2) for more information about the algorithm used.
1712 FFFD = '\ufffd'
1713 sequences = [
1714 # invalid start bytes
1715 (b'\x80', FFFD), # continuation byte
1716 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1717 (b'\xc0', FFFD),
1718 (b'\xc0\xc0', FFFD*2),
1719 (b'\xc1', FFFD),
1720 (b'\xc1\xc0', FFFD*2),
1721 (b'\xc0\xc1', FFFD*2),
1722 # with start byte of a 2-byte sequence
1723 (b'\xc2', FFFD), # only the start byte
1724 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001725 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001726 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1727 # with start byte of a 3-byte sequence
1728 (b'\xe1', FFFD), # only the start byte
1729 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1730 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1731 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1732 (b'\xe1\x80', FFFD), # only 1 continuation byte
1733 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1734 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1735 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1736 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1737 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1738 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1739 # with start byte of a 4-byte sequence
1740 (b'\xf1', FFFD), # only the start byte
1741 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1742 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1743 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1744 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1745 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1746 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1747 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1748 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1749 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1750 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1751 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1752 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1753 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1754 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1755 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1756 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1757 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1758 # with invalid start byte of a 4-byte sequence (rfc2279)
1759 (b'\xf5', FFFD), # only the start byte
1760 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1761 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1762 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1763 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1764 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1765 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1766 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1767 # with invalid start byte of a 5-byte sequence (rfc2279)
1768 (b'\xf8', FFFD), # only the start byte
1769 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1770 (b'\xf8\x80', FFFD*2), # only one continuation byte
1771 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1772 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1773 # with invalid start byte of a 6-byte sequence (rfc2279)
1774 (b'\xfc', FFFD), # only the start byte
1775 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1776 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1777 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1778 # invalid start byte
1779 (b'\xfe', FFFD),
1780 (b'\xfe\x80\x80', FFFD*3),
1781 # other sequences
1782 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1783 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1784 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1785 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1786 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1787 ]
1788 for n, (seq, res) in enumerate(sequences):
1789 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1790 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1791 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1792 self.assertEqual(seq.decode('utf-8', 'ignore'),
1793 res.replace('\uFFFD', ''))
1794
def to_bytestring(self, seq):
    """Convert whitespace-separated hex byte values (e.g. 'C2 41') to bytes."""
    hex_tokens = seq.split()
    return bytes([int(token, 16) for token in hex_tokens])
1797
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check how the invalid UTF-8 byte sequence *seq* is decoded.

    With the default (strict) error handler, decoding must raise a
    UnicodeDecodeError whose message contains *err*.  With 'replace' the
    result must be *res*; with 'ignore' it must be *res* with every U+FFFD
    removed.  Both lenient handlers are also checked with the sequence
    embedded between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    padded = b'aaaa' + seq + b'bbbb'

    def check(handler, expected):
        # the bare sequence first, then the same sequence surrounded by
        # valid ASCII text
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(padded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')

    check('replace', res)
    check('ignore', res.replace('\ufffd', ''))
1816
1817 def test_invalid_start_byte(self):
1818 """
1819 Test that an 'invalid start byte' error is raised when the first byte
1820 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1821 4-bytes sequence. The invalid start byte is replaced with a single
1822 U+FFFD when errors='replace'.
1823 E.g. <80> is a continuation byte and can appear only after a start byte.
1824 """
1825 FFFD = '\ufffd'
1826 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1827 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1828 'invalid start byte')
1829
1830 def test_unexpected_end_of_data(self):
1831 """
1832 Test that an 'unexpected end of data' error is raised when the string
1833 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1834 enough continuation bytes. The incomplete sequence is replaced with a
1835 single U+FFFD when errors='replace'.
1836 E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1837 sequence, but it's followed by only 2 valid continuation bytes and the
1838 last continuation bytes is missing.
1839 Note: the continuation bytes must be all valid, if one of them is
1840 invalid another error will be raised.
1841 """
1842 sequences = [
1843 'C2', 'DF',
1844 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1845 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1846 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1847 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1848 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1849 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1850 ]
1851 FFFD = '\ufffd'
1852 for seq in sequences:
1853 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd',
1854 'unexpected end of data')
1855
1856 def test_invalid_cb_for_2bytes_seq(self):
1857 """
1858 Test that an 'invalid continuation byte' error is raised when the
1859 continuation byte of a 2-bytes sequence is invalid. The start byte
1860 is replaced by a single U+FFFD and the second byte is handled
1861 separately when errors='replace'.
1862 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1863 sequence, but 41 is not a valid continuation byte because it's the
1864 ASCII letter 'A'.
1865 """
1866 FFFD = '\ufffd'
1867 FFFDx2 = FFFD * 2
1868 sequences = [
1869 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1870 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1871 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1872 ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1873 ]
1874 for seq, res in sequences:
1875 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1876 'invalid continuation byte')
1877
1878 def test_invalid_cb_for_3bytes_seq(self):
1879 """
1880 Test that an 'invalid continuation byte' error is raised when the
1881 continuation byte(s) of a 3-bytes sequence are invalid. When
1882 errors='replace', if the first continuation byte is valid, the first
1883 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1884 third byte is handled separately, otherwise only the start byte is
1885 replaced with a U+FFFD and the other continuation bytes are handled
1886 separately.
1887 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1888 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1889 because it's the ASCII letter 'A'.
1890 Note: when the start byte is E0 or ED, the valid ranges for the first
1891 continuation byte are limited to A0..BF and 80..9F respectively.
1892 Python 2 used to consider all the bytes in range 80..BF valid when the
1893 start byte was ED. This is fixed in Python 3.
1894 """
1895 FFFD = '\ufffd'
1896 FFFDx2 = FFFD * 2
1897 sequences = [
1898 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1899 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1900 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1901 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1902 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1903 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1904 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1905 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1906 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1907 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1908 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1909 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1910 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1911 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1912 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1913 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1914 ('ED 7F', FFFD+'\x7f'),
1915 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1916 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1917 ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1918 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1919 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1920 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1921 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1922 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1923 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1924 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1925 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1926 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1927 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1928 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1929 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1930 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1931 ]
1932 for seq, res in sequences:
1933 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1934 'invalid continuation byte')
1935
1936 def test_invalid_cb_for_4bytes_seq(self):
1937 """
1938 Test that an 'invalid continuation byte' error is raised when the
1939 continuation byte(s) of a 4-bytes sequence are invalid. When
1940 errors='replace',the start byte and all the following valid
1941 continuation bytes are replaced with a single U+FFFD, and all the bytes
1942 starting from the first invalid continuation bytes (included) are
1943 handled separately.
1944 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1945 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1946 because it's the ASCII letter 'A'.
1947 Note: when the start byte is E0 or ED, the valid ranges for the first
1948 continuation byte are limited to A0..BF and 80..9F respectively.
1949 However, when the start byte is ED, Python 2 considers all the bytes
1950 in range 80..BF valid. This is fixed in Python 3.
1951 """
1952 FFFD = '\ufffd'
1953 FFFDx2 = FFFD * 2
1954 sequences = [
1955 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1956 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1957 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1958 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1959 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1960 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1961 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1962 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1963 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1964 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1965 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1966 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1967 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1968 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1969 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1970 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1971 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1972 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1973 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1974 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1975 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1976 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1977 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1978 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1979 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1980 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1981 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1982 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
1983 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
1984 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
1985 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
1986 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
1987 ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
1988 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
1989 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
1990 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
1991 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
1992 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
1993 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
1994 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
1995 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
1996 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
1997 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
1998 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
1999 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2000 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2001 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2002 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2003 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2004 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2005 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2006 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2007 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2008 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2009 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2010 ]
2011 for seq, res in sequences:
2012 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
2013 'invalid continuation byte')
2014
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002015 def test_codecs_idna(self):
2016 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00002017 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00002018
def test_codecs_errors(self):
    """Check codec error handlers and the errors raised for bad codecs or arguments."""
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding)
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii','strict')
    self.assertEqual(text.encode('ascii','ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii','replace'), b"Andr? x")
    # positional and keyword spellings of the arguments must agree
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, str, data, 'ascii')
    self.assertRaises(UnicodeError, str, data, 'ascii', 'strict')
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')
    self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

    # Error handling (unknown character names)
    decoded = b"\\N{foo}xx".decode("unicode-escape", "ignore")
    self.assertEqual(decoded, "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # Error handling (bad codec return): "test.unicode1" and "test.unicode2"
    # are the codecs registered by search_function() at the top of the file
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    self.assertRaises(TypeError, "hello".encode, "test.unicode1")
    self.assertRaises(TypeError, "hello".encode, "test.unicode2")

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            self.assertRaises(UnicodeError, convert, lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00002056
Walter Dörwald28256f22003-01-19 16:59:20 +00002057 def test_codecs(self):
2058 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002059 self.assertEqual('hello'.encode('ascii'), b'hello')
2060 self.assertEqual('hello'.encode('utf-7'), b'hello')
2061 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002062 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002063 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2064 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2065 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002066
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002067 # Default encoding is utf-8
2068 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2069
Walter Dörwald28256f22003-01-19 16:59:20 +00002070 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002071 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002072 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002073 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2074 'utf-16-be', 'raw_unicode_escape',
2075 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002076 with warnings.catch_warnings():
2077 # unicode-internal has been deprecated
2078 warnings.simplefilter("ignore", DeprecationWarning)
2079
2080 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002081
Walter Dörwald28256f22003-01-19 16:59:20 +00002082 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002083 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002084 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002085 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002086 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002087
Walter Dörwald28256f22003-01-19 16:59:20 +00002088 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002089 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002090 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002091 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002092 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002093
Walter Dörwald28256f22003-01-19 16:59:20 +00002094 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002095 with warnings.catch_warnings():
2096 # unicode-internal has been deprecated
2097 warnings.simplefilter("ignore", DeprecationWarning)
2098
2099 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2100 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2101 'raw_unicode_escape',
2102 'unicode_escape', 'unicode_internal'):
2103 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002104
Antoine Pitrou51f66482011-11-11 13:35:44 +01002105 # UTF-8 must be roundtrip safe for all code points
2106 # (except surrogates, which are forbidden).
2107 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002108 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002109 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002110 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002111
Walter Dörwald28256f22003-01-19 16:59:20 +00002112 def test_codecs_charmap(self):
2113 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002114 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002115 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002116 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002117 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2118 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002119 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002120 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2121 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002122 'iso8859_7', 'iso8859_9',
2123 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002124 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002125
Walter Dörwald28256f22003-01-19 16:59:20 +00002126 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2127 'cp1256', 'cp1257', 'cp1258',
2128 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002129
Walter Dörwald28256f22003-01-19 16:59:20 +00002130 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2131 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002132
Walter Dörwald28256f22003-01-19 16:59:20 +00002133 ### These have undefined mappings:
2134 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002135
Walter Dörwald28256f22003-01-19 16:59:20 +00002136 ### These fail the round-trip:
2137 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002138
Walter Dörwald28256f22003-01-19 16:59:20 +00002139 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002140 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002141
Walter Dörwald28256f22003-01-19 16:59:20 +00002142 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002143 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002144 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002145 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002146 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2147 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002148 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002149 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2150 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002151 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002152 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002153
Walter Dörwald28256f22003-01-19 16:59:20 +00002154 ### These have undefined mappings:
2155 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2156 #'cp1256', 'cp1257', 'cp1258',
2157 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002158 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002159 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002160
Walter Dörwald28256f22003-01-19 16:59:20 +00002161 ### These fail the round-trip:
2162 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002163
Walter Dörwald28256f22003-01-19 16:59:20 +00002164 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002165 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002166
Walter Dörwald28256f22003-01-19 16:59:20 +00002167 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002168 self.assertEqual(("abc" "def"), "abcdef")
2169 self.assertEqual(("abc" "def"), "abcdef")
2170 self.assertEqual(("abc" "def"), "abcdef")
2171 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2172 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002173
Walter Dörwald28256f22003-01-19 16:59:20 +00002174 def test_printing(self):
2175 class BitBucket:
2176 def write(self, text):
2177 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002178
Walter Dörwald28256f22003-01-19 16:59:20 +00002179 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002180 print('abc', file=out)
2181 print('abc', 'def', file=out)
2182 print('abc', 'def', file=out)
2183 print('abc', 'def', file=out)
2184 print('abc\n', file=out)
2185 print('abc\n', end=' ', file=out)
2186 print('abc\n', end=' ', file=out)
2187 print('def\n', file=out)
2188 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002189
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002190 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002191 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2193 self.assertEqual(x, y)
2194
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002195 y = br'\U00100000'
2196 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2197 self.assertEqual(x, y)
2198 y = br'\U00010000'
2199 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2200 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002201
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002202 try:
2203 br'\U11111111'.decode("raw-unicode-escape")
2204 except UnicodeDecodeError as e:
2205 self.assertEqual(e.start, 0)
2206 self.assertEqual(e.end, 10)
2207 else:
2208 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002209
def test_conversion(self):
    """Check that str() uses __str__(), including on str subclasses."""
    class ObjectToStr:
        def __str__(self):
            return "foo"

    class StrSubclassToStr(str):
        def __str__(self):
            return "foo"

    class StrSubclassToStrSubclass(str):
        # The constructor doubles its argument, so the tests below can tell
        # which value ends up in the result.
        def __new__(cls, content=""):
            doubled = 2*content
            return str.__new__(cls, doubled)

        def __str__(self):
            return self

    plain = ObjectToStr()
    self.assertEqual(str(plain), "foo")
    overriding = StrSubclassToStr("bar")
    self.assertEqual(str(overriding), "foo")

    # __str__() returning a str subclass instance: str() hands it back as is
    s = str(StrSubclassToStrSubclass("foo"))
    self.assertEqual(s, "foofoo")
    self.assertIs(type(s), StrSubclassToStrSubclass)

    # constructing another str subclass from it yields that subclass
    s = StrSubclass(StrSubclassToStrSubclass("foo"))
    self.assertEqual(s, "foofoo")
    self.assertIs(type(s), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002235 def test_unicode_repr(self):
2236 class s1:
2237 def __repr__(self):
2238 return '\\n'
2239
2240 class s2:
2241 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002242 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002243
2244 self.assertEqual(repr(s1()), '\\n')
2245 self.assertEqual(repr(s2()), '\\n')
2246
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002247 def test_printable_repr(self):
2248 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002249 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002250
Zachary Ware9fe6d862013-12-08 00:20:35 -06002251 # This test only affects 32-bit platforms because expandtabs can only take
2252 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2253 # to take a 64-bit long, this test should apply to all platforms.
2254 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2255 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002256 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002257 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002258
@support.cpython_only
def test_expandtabs_optimization(self):
    """expandtabs() on a string without tabs must return the same object."""
    text = 'abc'
    self.assertIs(text.expandtabs(), text)
2263
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002264 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 if struct.calcsize('P') == 8:
2266 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002267 ascii_struct_size = 48
2268 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 else:
2270 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002271 ascii_struct_size = 24
2272 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273
2274 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2275 code = ord(char)
2276 if code < 0x100:
2277 char_size = 1 # sizeof(Py_UCS1)
2278 struct_size = ascii_struct_size
2279 elif code < 0x10000:
2280 char_size = 2 # sizeof(Py_UCS2)
2281 struct_size = compact_struct_size
2282 else:
2283 char_size = 4 # sizeof(Py_UCS4)
2284 struct_size = compact_struct_size
2285 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002286 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2287 # be allocatable, given enough memory.
2288 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 alloc = lambda: char * maxlen
2290 self.assertRaises(MemoryError, alloc)
2291 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002292
Victor Stinner808fc0a2010-03-22 12:50:40 +00002293 def test_format_subclass(self):
2294 class S(str):
2295 def __str__(self):
2296 return '__str__ overridden'
2297 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002298 self.assertEqual("%s" % s, '__str__ overridden')
2299 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002300
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002301 # Test PyUnicode_FromFormat()
def test_from_format(self):
    """Exercise the C function PyUnicode_FromFormat() through ctypes.

    Each check calls ``pythonapi.PyUnicode_FromFormat`` with a bytes format
    string plus arguments and compares the returned str with the expected
    text.  Expected values for width-padded conversions are built with
    ``str.rjust()`` so the amount of padding is explicit and cannot be lost
    to whitespace normalisation.
    """
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(fmt, *args):
        # str arguments are wrapped in py_object; every other argument
        # (bytes, None, ctypes integers) is handed to ctypes unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(fmt, *cargs)

    def check_format(expected, fmt, *args):
        text = PyUnicode_FromFormat(fmt, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd', b'%c', c_int(0xabcd))
    check_format('\U0010ffff', b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%', b'%')
    check_format('%', b'%%')
    check_format('%s', b'%%s')
    check_format('[%]', b'[%%]')
    check_format('%abc', b'%%%s', b'abc')

    # truncated string
    check_format('abc', b'%.3s', b'abcdef')
    check_format('abc[\ufffd', b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'", b'%A', '\u20acABC')
    check_format("'\\u20", b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'", b'%R', '\u20acABC')
    check_format("'\u20acA", b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # The following tests come from #7330.  The expected text is the
    # (possibly truncated) value right-justified in the requested width.

    # test width modifier and precision modifier with %S
    check_format('repr=' + 'abc'.rjust(5), b'repr=%5S', 'abc')
    check_format('repr=ab', b'repr=%.2S', 'abc')
    check_format('repr=' + 'ab'.rjust(5), b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format('repr=' + "'abc'".rjust(8), b'repr=%8R', 'abc')
    check_format("repr='ab", b'repr=%.3R', 'abc')
    check_format('repr=' + "'ab".rjust(5), b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format('repr=' + "'abc'".rjust(8), b'repr=%8A', 'abc')
    check_format("repr='ab", b'repr=%.3A', 'abc')
    check_format('repr=' + "'ab".rjust(5), b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format('repr=' + 'abc'.rjust(5), b'repr=%5s', b'abc')
    check_format('repr=ab', b'repr=%.2s', b'abc')
    check_format('repr=' + 'ab'.rjust(5), b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format('repr=' + 'abc'.rjust(5), b'repr=%5U', 'abc')
    check_format('repr=ab', b'repr=%.2U', 'abc')
    check_format('repr=' + 'ab'.rjust(5), b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format('repr=' + 'abc'.rjust(5), b'repr=%5V', 'abc', b'123')
    check_format('repr=ab', b'repr=%.2V', 'abc', b'123')
    check_format('repr=' + 'ab'.rjust(5), b'repr=%5.2V', 'abc', b'123')
    check_format('repr=' + '123'.rjust(5), b'repr=%5V', None, b'123')
    check_format('repr=12', b'repr=%.2V', None, b'123')
    check_format('repr=' + '12'.rjust(5), b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010', b'%03i', c_int(10))
    check_format('0010', b'%0.4i', c_int(10))
    check_format('-123', b'%i', c_int(-123))
    check_format('-123', b'%li', c_long(-123))
    check_format('-123', b'%lli', c_longlong(-123))
    check_format('-123', b'%zi', c_ssize_t(-123))

    check_format('-123', b'%d', c_int(-123))
    check_format('-123', b'%ld', c_long(-123))
    check_format('-123', b'%lld', c_longlong(-123))
    check_format('-123', b'%zd', c_ssize_t(-123))

    check_format('123', b'%u', c_uint(123))
    check_format('123', b'%lu', c_ulong(123))
    check_format('123', b'%llu', c_ulonglong(123))
    check_format('123', b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong), b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong), b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong), b'%llu', c_ulonglong(max_ulonglong))
    # The result of %p is not compared with anything: the call only has
    # to complete.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'), b'%010i', c_int(123))
    check_format('123'.rjust(100), b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'), b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'), b'%010u', c_uint(123))
    check_format('123'.rjust(100), b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'), b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'), b'%010x', c_int(0x123))
    check_format('123'.rjust(100), b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'), b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc', b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd', b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s', b'%1%s', b'abc')
    check_format('%1abc', b'%1abc')
    check_format('%+i', b'%+i', c_int(10))
    check_format('%.%s', b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002533
Victor Stinner1c24bd02010-10-02 11:03:13 +00002534 # Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Check the (buffer, size) pairs returned by unicode_aswidechar().

    The helper comes from ``_testcapi`` and is called with a str and a
    buffer length; the returned buffer and size are compared with fixed
    expectations.
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, buffer length, expected size, expected buffer content)
    cases = (
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    )
    for text, buffer_length, expected_size, expected_buffer in cases:
        wchar, size = unicode_aswidechar(text, buffer_length)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_buffer)

    # A non-BMP character is expected to occupy two units when wchar_t is
    # two bytes wide and a single unit otherwise; the buffer has room for
    # one extra unit in both cases.
    nonbmp = chr(0x10ffff)
    narrow_wchar = sizeof(c_wchar) == 2
    buflen = 3 if narrow_wchar else 2
    nchar = 2 if narrow_wchar else 1
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002571
Victor Stinner1c24bd02010-10-02 11:03:13 +00002572 # Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    """Check the (buffer, size) pairs returned by unicode_aswidecharstring().

    The helper comes from ``_testcapi``; every expected buffer ends with a
    trailing NUL while the reported size does not count it.
    """
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, expected size, expected buffer content)
    cases = (
        ('abc', 3, 'abc\0'),
        ('abc\0def', 7, 'abc\0def\0'),
    )
    for text, expected_size, expected_buffer in cases:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_buffer)

    # A non-BMP character is expected to take two units with a 2-byte
    # wchar_t and one unit otherwise.
    nonbmp = chr(0x10ffff)
    nchar = 2 if sizeof(c_wchar) == 2 else 1
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002595
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002596 def test_subclass_add(self):
2597 class S(str):
2598 def __add__(self, o):
2599 return "3"
2600 self.assertEqual(S("4") + S("5"), "3")
2601 class S(str):
2602 def __iadd__(self, o):
2603 return "3"
2604 s = S("1")
2605 s += "4"
2606 self.assertEqual(s, "3")
2607
@support.cpython_only
def test_encode_decimal(self):
    """Check ``_testcapi.unicode_encodedecimal`` on valid and invalid input.

    Valid inputs are compared with their expected bytes; a character that
    cannot be encoded must raise, whatever error handler is requested.
    """
    from _testcapi import unicode_encodedecimal

    # (text, expected bytes)
    expectations = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, encoded in expectations:
        self.assertEqual(unicode_encodedecimal(text), encoded)

    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002623
@support.cpython_only
def test_transform_decimal(self):
    """Check ``_testcapi.unicode_transformdecimaltoascii`` against fixed cases.

    In the expectations below decimal digits become ASCII digits while the
    other characters (spaces, the euro sign) come back unchanged.
    """
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    # (text, expected result)
    expectations = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, transformed in expectations:
        self.assertEqual(transform_decimal(text), transformed)
2635
Victor Stinnerc814a382011-11-22 01:06:15 +01002636 def test_getnewargs(self):
2637 text = 'abc'
2638 args = text.__getnewargs__()
2639 self.assertIsNot(args[0], text)
2640 self.assertEqual(args[0], text)
2641 self.assertEqual(len(args), 1)
2642
def test_resize(self):
    """Round-trip a string through 'unicode_internal' before and after ``+=``.

    The string is encoded once, extended with ``+=`` and encoded again; both
    encodings must decode back to the current text and must differ from
    each other.
    """
    # The codec is deprecated, so its DeprecationWarning is expected.
    expected_warning = ('unicode_internal codec has been '
                        'deprecated', DeprecationWarning)
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        with support.check_warnings(expected_warning):
            # fill wstr internal field
            before = text.encode('unicode_internal')
            self.assertEqual(before.decode('unicode_internal'), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode('unicode_internal')
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode('unicode_internal'), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002659
Victor Stinner9fc59812013-04-08 22:34:43 +02002660 def test_compare(self):
2661 # Issue #17615
2662 N = 10
2663 ascii = 'a' * N
2664 ascii2 = 'z' * N
2665 latin = '\x80' * N
2666 latin2 = '\xff' * N
2667 bmp = '\u0100' * N
2668 bmp2 = '\uffff' * N
2669 astral = '\U00100000' * N
2670 astral2 = '\U0010ffff' * N
2671 strings = (
2672 ascii, ascii2,
2673 latin, latin2,
2674 bmp, bmp2,
2675 astral, astral2)
2676 for text1, text2 in itertools.combinations(strings, 2):
2677 equal = (text1 is text2)
2678 self.assertEqual(text1 == text2, equal)
2679 self.assertEqual(text1 != text2, not equal)
2680
2681 if equal:
2682 self.assertTrue(text1 <= text2)
2683 self.assertTrue(text1 >= text2)
2684
2685 # text1 is text2: duplicate strings to skip the "str1 == str2"
2686 # optimization in unicode_compare_eq() and really compare
2687 # character per character
2688 copy1 = duplicate_string(text1)
2689 copy2 = duplicate_string(text2)
2690 self.assertIsNot(copy1, copy2)
2691
2692 self.assertTrue(copy1 == copy2)
2693 self.assertFalse(copy1 != copy2)
2694
2695 self.assertTrue(copy1 <= copy2)
2696 self.assertTrue(copy2 >= copy2)
2697
2698 self.assertTrue(ascii < ascii2)
2699 self.assertTrue(ascii < latin)
2700 self.assertTrue(ascii < bmp)
2701 self.assertTrue(ascii < astral)
2702 self.assertFalse(ascii >= ascii2)
2703 self.assertFalse(ascii >= latin)
2704 self.assertFalse(ascii >= bmp)
2705 self.assertFalse(ascii >= astral)
2706
2707 self.assertFalse(latin < ascii)
2708 self.assertTrue(latin < latin2)
2709 self.assertTrue(latin < bmp)
2710 self.assertTrue(latin < astral)
2711 self.assertTrue(latin >= ascii)
2712 self.assertFalse(latin >= latin2)
2713 self.assertFalse(latin >= bmp)
2714 self.assertFalse(latin >= astral)
2715
2716 self.assertFalse(bmp < ascii)
2717 self.assertFalse(bmp < latin)
2718 self.assertTrue(bmp < bmp2)
2719 self.assertTrue(bmp < astral)
2720 self.assertTrue(bmp >= ascii)
2721 self.assertTrue(bmp >= latin)
2722 self.assertFalse(bmp >= bmp2)
2723 self.assertFalse(bmp >= astral)
2724
2725 self.assertFalse(astral < ascii)
2726 self.assertFalse(astral < latin)
2727 self.assertFalse(astral < bmp2)
2728 self.assertTrue(astral < astral2)
2729 self.assertTrue(astral >= ascii)
2730 self.assertTrue(astral >= latin)
2731 self.assertTrue(astral >= bmp2)
2732 self.assertFalse(astral >= astral2)
2733
@support.cpython_only
def test_pep393_utf8_caching_bug(self):
    """Issue #25709: string concatenation must not leave a stale UTF-8 cache.

    A string is grown one character at a time and, after every step,
    ``_testcapi.getargs_s_hash`` must return the UTF-8 encoding of the
    current contents, on the first call and on a repeated call.
    """
    from _testcapi import getargs_s_hash
    for code_point in 0x24, 0xa4, 0x20ac, 0x1f40d:
        encoded_char = chr(code_point).encode()
        s = ''
        for repeat in range(1, 6):
            # Due to CPython specific optimization the 's' string can be
            # resized in-place.
            s += chr(code_point)
            expected = encoded_char * repeat
            # Parsing with the "s#" format code calls indirectly
            # PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            self.assertEqual(getargs_s_hash(s), expected)
            # Check that the second call returns the same result
            self.assertEqual(getargs_s_hash(s), expected)
2750
def test_free_after_iterating(self):
    """Run support.check_free_after_iterating() for iter() and reversed() on str."""
    for make_iterator in (iter, reversed):
        support.check_free_after_iterating(self, make_iterator, str)
2754
Victor Stinner1c24bd02010-10-02 11:03:13 +00002755
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        """formatter_parser() must yield the expected 4-tuples per format."""
        # (format string, expected list of parsed tuples)
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for format_string, expected in cases:
            parsed = list(_string.formatter_parser(format_string))
            self.assertEqual(parsed, expected)

        # A non-str argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() must split a name into head and accessors."""
        # (field name, expected [first, [(is_attribute, key), ...]])
        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, expected in cases:
            first, rest = _string.formatter_field_name_split(field_name)
            # The second item is an iterator: materialise it for comparison.
            self.assertEqual([first, list(rest)], expected)

        # A non-str argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2805
2806
# Run every test case defined in this module when the file is executed
# directly as a script.
if __name__ == "__main__":
    unittest.main()