blob: fac8b7b6022a471c35db2cfe989c103d2eb46789 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately malformed test codecs.

    "test.unicode1" resolves to an encoder/decoder pair that returns a bare
    int instead of a tuple; "test.unicode2" resolves to a pair that returns
    a tuple whose first item is not a string.  Every other name gives None.
    """
    # Pair whose return value is not even a tuple.
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple

    # Pair whose return value is a tuple, but without any text in it.
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None

codecs.register(search_function)
35
def duplicate_string(text):
    """
    Return a copy of *text* obtained through an encode/decode round trip.

    The aim is a new object with a reference count of 1.  This is only
    best-effort: latin1 single letters and the empty string ('') are
    singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
45
# Plain str subclass that adds no behaviour of its own.
# NOTE(review): presumably used by tests outside this excerpt to exercise
# str-subclass handling -- confirm against its callers.
class StrSubclass(str):
    pass
48
Brett Cannon226b2302010-03-20 22:22:22 +000049class UnicodeTest(string_tests.CommonTest,
50 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020051 string_tests.MixinStrUnicodeTest,
52 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000053
Guido van Rossumef87d6e2007-05-02 19:09:54 +000054 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000055
56 def checkequalnofix(self, result, object, methodname, *args):
57 method = getattr(object, methodname)
58 realresult = method(*args)
59 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000060 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000061
62 # if the original is returned make sure that
63 # this doesn't happen with subclasses
64 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000065 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000066 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000067 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000068 object = usub(object)
69 method = getattr(object, methodname)
70 realresult = method(*args)
71 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000072 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000073
Jeremy Hylton504de6b2003-10-06 05:08:26 +000074 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000075 self.assertEqual('\xff', '\u00ff')
76 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000077 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
78 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
79 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000080 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000081 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000082
    def test_ascii(self):
        # Checks ascii() on backslashes, control characters, quote
        # selection, all 256 Latin-1 code points, a long non-BMP string and
        # an object whose __repr__ returns bytes.
        # Every check is skipped where sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            # Test basic sanity of ascii()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            # NOTE(review): the next two assertions are identical; the
            # repetition looks like a historical leftover.
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            # A lone quote is wrapped in the other quote character.
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            # Expected ascii() of chr(0)..chr(255): every code point from
            # \x7f upwards is written as a \xNN escape.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            # Both operands are the same expression, so this only shows that
            # the call completes and gives the same value twice.
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make ascii() raise TypeError.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())
124
    def test_repr(self):
        # Checks repr() on backslashes, control characters, quote
        # selection, all 256 Latin-1 code points, a long non-BMP string and
        # an object whose __repr__ returns bytes.
        # Every check is skipped where sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            # NOTE(review): the next two assertions are identical; the
            # repetition looks like a historical leftover.
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'\""), """'\\'"'""")
            # A lone quote is wrapped in the other quote character.
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
            # Expected repr() of chr(0)..chr(255).  Unlike the ascii() table,
            # \xa1..\xff appear here as the characters themselves; only
            # \xa0 and \xad in that range are expected as escapes.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
            testrepr = repr(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test repr works on wide unicode escapes without overflow.
            # Both operands are the same expression, so this only shows that
            # the call completes and gives the same value twice.
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make repr() raise TypeError.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())
166
Guido van Rossum49d6b072006-08-17 21:11:47 +0000167 def test_iterators(self):
168 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000169 it = "\u1111\u2222\u3333".__iter__()
170 self.assertEqual(next(it), "\u1111")
171 self.assertEqual(next(it), "\u2222")
172 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000173 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000174
Walter Dörwald28256f22003-01-19 16:59:20 +0000175 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000176 string_tests.CommonTest.test_count(self)
177 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000178 self.checkequalnofix(3, 'aaa', 'count', 'a')
179 self.checkequalnofix(0, 'aaa', 'count', 'b')
180 self.checkequalnofix(3, 'aaa', 'count', 'a')
181 self.checkequalnofix(0, 'aaa', 'count', 'b')
182 self.checkequalnofix(0, 'aaa', 'count', 'b')
183 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
184 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
185 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
186 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200187 # test mixed kinds
188 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
189 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
190 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
191 self.checkequal(0, 'a' * 10, 'count', '\u0102')
192 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
193 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
194 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
195 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
196 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
197 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
198 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
199 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000200
Walter Dörwald28256f22003-01-19 16:59:20 +0000201 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200202 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200203 # test implementation details of the memchr fast path
204 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
205 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
206 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
207 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
208 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
209 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
210 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
211 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000212 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
213 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
214 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000215
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000216 self.assertRaises(TypeError, 'hello'.find)
217 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200218 # test mixed kinds
219 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
220 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
221 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
222 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
223 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
224 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
225 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
226 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
227 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
228 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
229 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
230 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000231
Walter Dörwald28256f22003-01-19 16:59:20 +0000232 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000233 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200234 # test implementation details of the memrchr fast path
235 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
236 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
237 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
238 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
239 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
240 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
241 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000242 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000243 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
244 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
245 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200246 # test mixed kinds
247 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
248 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
249 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
250 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
251 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
252 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
253 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
254 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
255 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
256 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
257 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
258 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000259
Walter Dörwald28256f22003-01-19 16:59:20 +0000260 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000261 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000262 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
263 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
264 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
265 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
266 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
267 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
268 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
269 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200270 # test mixed kinds
271 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
272 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
273 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
274 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
275 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
276 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
277 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
278 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
279 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
280 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
281 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
282 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000283
Walter Dörwald28256f22003-01-19 16:59:20 +0000284 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000285 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000286 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
287 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
288 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
289 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000290
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000291 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
292 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
293 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
294 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
295 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200296 # test mixed kinds
297 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
298 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
299 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
300 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
301 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
302 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
303 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
304 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
305 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
306 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
307 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
308 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000309
Georg Brandlceee0772007-11-27 23:48:05 +0000310 def test_maketrans_translate(self):
311 # these work with plain translate()
312 self.checkequalnofix('bbbc', 'abababc', 'translate',
313 {ord('a'): None})
314 self.checkequalnofix('iiic', 'abababc', 'translate',
315 {ord('a'): None, ord('b'): ord('i')})
316 self.checkequalnofix('iiix', 'abababc', 'translate',
317 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
318 self.checkequalnofix('c', 'abababc', 'translate',
319 {ord('a'): None, ord('b'): ''})
320 self.checkequalnofix('xyyx', 'xzx', 'translate',
321 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200322
Georg Brandlceee0772007-11-27 23:48:05 +0000323 # this needs maketrans()
324 self.checkequalnofix('abababc', 'abababc', 'translate',
325 {'b': '<i>'})
326 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
327 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
328 # test alternative way of calling maketrans()
329 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
330 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
331
Victor Stinner5a29f252014-04-05 00:17:51 +0200332 # various tests switching from ASCII to latin1 or the opposite;
333 # same length, remove a letter, or replace with a longer string.
334 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
335 "[X]")
336 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
337 "[X]")
338 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
339 "[]")
340 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
341 "[XXX]")
342 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
343 "[\xe9]")
344 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
345 "[<\xe9>]")
346 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
347 "[a]")
348 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
349 "[]")
350
Victor Stinner4ff33af2014-04-05 11:56:37 +0200351 # invalid Unicode characters
352 invalid_char = 0x10ffff+1
353 for before in "a\xe9\u20ac\U0010ffff":
354 mapping = str.maketrans({before: invalid_char})
355 text = "[%s]" % before
356 self.assertRaises(ValueError, text.translate, mapping)
357
358 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000359 self.assertRaises(TypeError, self.type2test.maketrans)
360 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
361 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
362 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
363 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
364 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
365 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000366
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000367 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000368 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000369
Walter Dörwald28256f22003-01-19 16:59:20 +0000370 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000371 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000372
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000373 # Mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000374 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
375 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
376 self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200377 # test mixed kinds
378 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
379 left *= 9
380 right *= 9
381 for delim in ('c', '\u0102', '\U00010302'):
382 self.checkequal([left + right],
383 left + right, 'split', delim)
384 self.checkequal([left, right],
385 left + delim + right, 'split', delim)
386 self.checkequal([left + right],
387 left + right, 'split', delim * 2)
388 self.checkequal([left, right],
389 left + delim * 2 + right, 'split', delim *2)
390
391 def test_rsplit(self):
392 string_tests.CommonTest.test_rsplit(self)
393 # test mixed kinds
394 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
395 left *= 9
396 right *= 9
397 for delim in ('c', '\u0102', '\U00010302'):
398 self.checkequal([left + right],
399 left + right, 'rsplit', delim)
400 self.checkequal([left, right],
401 left + delim + right, 'rsplit', delim)
402 self.checkequal([left + right],
403 left + right, 'rsplit', delim * 2)
404 self.checkequal([left, right],
405 left + delim * 2 + right, 'rsplit', delim *2)
406
407 def test_partition(self):
408 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
409 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300410 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200411 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
412 left *= 9
413 right *= 9
414 for delim in ('c', '\u0102', '\U00010302'):
415 self.checkequal((left + right, '', ''),
416 left + right, 'partition', delim)
417 self.checkequal((left, delim, right),
418 left + delim + right, 'partition', delim)
419 self.checkequal((left + right, '', ''),
420 left + right, 'partition', delim * 2)
421 self.checkequal((left, delim * 2, right),
422 left + delim * 2 + right, 'partition', delim * 2)
423
424 def test_rpartition(self):
425 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
426 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300427 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200428 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
429 left *= 9
430 right *= 9
431 for delim in ('c', '\u0102', '\U00010302'):
432 self.checkequal(('', '', left + right),
433 left + right, 'rpartition', delim)
434 self.checkequal((left, delim, right),
435 left + delim + right, 'rpartition', delim)
436 self.checkequal(('', '', left + right),
437 left + right, 'rpartition', delim * 2)
438 self.checkequal((left, delim * 2, right),
439 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000440
Walter Dörwald28256f22003-01-19 16:59:20 +0000441 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000442 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000443
Guido van Rossumf1044292007-09-27 18:01:22 +0000444 class MyWrapper:
445 def __init__(self, sval): self.sval = sval
446 def __str__(self): return self.sval
447
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000448 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000449 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
450 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
451 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
452 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
453 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
454 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
455 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000456 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
457 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
458 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
459 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000460
Walter Dörwald28256f22003-01-19 16:59:20 +0000461 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000462 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000463
Walter Dörwald28256f22003-01-19 16:59:20 +0000464 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000465 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
466 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200467 # test mixed kinds
468 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
469 left *= 9
470 right *= 9
471 for delim in ('c', '\u0102', '\U00010302'):
472 for repl in ('d', '\u0103', '\U00010303'):
473 self.checkequal(left + right,
474 left + right, 'replace', delim, repl)
475 self.checkequal(left + repl + right,
476 left + delim + right,
477 'replace', delim, repl)
478 self.checkequal(left + right,
479 left + right, 'replace', delim * 2, repl)
480 self.checkequal(left + repl + right,
481 left + delim * 2 + right,
482 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000483
Victor Stinner59de0ee2011-10-07 10:01:28 +0200484 @support.cpython_only
485 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200486 pattern = 'abc'
487 text = 'abc def'
488 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200489
Guido van Rossum98297ee2007-11-06 21:34:58 +0000490 def test_bytes_comparison(self):
Brett Cannon226b2302010-03-20 22:22:22 +0000491 with support.check_warnings():
492 warnings.simplefilter('ignore', BytesWarning)
493 self.assertEqual('abc' == b'abc', False)
494 self.assertEqual('abc' != b'abc', True)
495 self.assertEqual('abc' == bytearray(b'abc'), False)
496 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000497
Walter Dörwald28256f22003-01-19 16:59:20 +0000498 def test_comparison(self):
499 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000500 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000501 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000502 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000503
504 if 0:
505 # Move these tests to a Unicode collation module test...
506 # Testing UTF-16 code point order comparisons...
507
508 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000509 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000510 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000511 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000512
513 # Non surrogate above surrogate value, fixup required
514 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000515 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000516
517 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000518 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000519 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000520 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000521 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000522 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000523 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000524 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000525 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000526 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000527 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000528 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000529 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000530 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000531 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000532 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000533 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000534 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000535 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000536 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000538 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000539 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000540 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000542 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000543 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000544 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000545 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000546 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000547 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000548 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000549 test_lecmp(s, s2)
550
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000551 test_fixup('\ue000')
552 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000553
554 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000555 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000556
def test_islower(self):
    """Check str.islower() on BMP and non-BMP characters."""
    # Shared checks first, then the str-specific additions.
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')

    not_lowercase = (
        '\u2167',
        # non-BMP, uppercase
        '\U00010401', '\U00010427',
        # non-BMP, non-cased
        '\U0001F40D', '\U0001F46F',
    )
    lowercase = (
        '\u2177',
        # non-BMP, lowercase
        '\U00010429', '\U0001044E',
    )
    for char in not_lowercase:
        self.assertFalse(char.islower())
    for char in lowercase:
        self.assertTrue(char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000571
def test_isupper(self):
    """Check str.isupper() on BMP and non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The U+1FFC check is skipped on platforms whose name starts with 'java'.
    running_on_java = sys.platform.startswith('java')
    if not running_on_java:
        self.checkequalnofix(False, '\u1FFc', 'isupper')

    uppercase = (
        '\u2167',
        # non-BMP, uppercase
        '\U00010401', '\U00010427',
    )
    not_uppercase = (
        '\u2177',
        # non-BMP, lowercase
        '\U00010429', '\U0001044E',
        # non-BMP, non-cased
        '\U0001F40D', '\U0001F46F',
    )
    for char in uppercase:
        self.assertTrue(char.isupper())
    for char in not_uppercase:
        self.assertFalse(char.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000587
def test_istitle(self):
    """Check str.istitle() on BMP and non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())

    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    untitled = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for ch in untitled:
        message = '{!a} is not title'.format(ch)
        self.assertFalse(ch.istitle(), message)
599
def test_isspace(self):
    """Check str.isspace() on BMP and non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    bmp_expectations = (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    )
    for expected, char in bmp_expectations:
        self.checkequalnofix(expected, char, 'isspace')

    # apparently there are no non-BMP space characters in Unicode 6
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for ch in non_bmp:
        message = '{!a} is not space.'.format(ch)
        self.assertFalse(ch.isspace(), message)
609
def test_isalnum(self):
    """Check str.isalnum() on a set of non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    alphanumeric = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107',
    )
    for ch in alphanumeric:
        message = '{!a} is alnum.'.format(ch)
        self.assertTrue(ch.isalnum(), message)
Walter Dörwald28256f22003-01-19 16:59:20 +0000615
def test_isalpha(self):
    """Check str.isalpha() on BMP and non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')

    # non-BMP, cased
    cased = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E')
    for char in cased:
        self.assertTrue(char.isalpha())

    # non-BMP, non-cased
    uncased = ('\U0001F40D', '\U0001F46F')
    for char in uncased:
        self.assertFalse(char.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000627
def test_isdecimal(self):
    """Check str.isdecimal() on ASCII, BMP and non-BMP inputs."""
    expectations = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in expectations:
        self.checkequalnofix(expected, text, 'isdecimal')

    # An unexpected extra argument must be rejected.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    not_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    for ch in not_decimal:
        message = '{!a} is not decimal.'.format(ch)
        self.assertFalse(ch.isdecimal(), message)

    decimal = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for ch in decimal:
        message = '{!a} is decimal.'.format(ch)
        self.assertTrue(ch.isdecimal(), message)
645
def test_isdigit(self):
    """Check str.isdigit() on BMP and non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    bmp_expectations = (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    )
    for expected, char in bmp_expectations:
        self.checkequalnofix(expected, char, 'isdigit')

    not_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    for ch in not_digits:
        message = '{!a} is not a digit.'.format(ch)
        self.assertFalse(ch.isdigit(), message)

    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for ch in digits:
        message = '{!a} is a digit.'.format(ch)
        self.assertTrue(ch.isdigit(), message)
657
def test_isnumeric(self):
    """Check str.isnumeric() on ASCII, BMP and non-BMP inputs."""
    expectations = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in expectations:
        self.checkequalnofix(expected, text, 'isnumeric')

    # An unexpected extra argument must be rejected.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    not_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    for ch in not_numeric:
        message = '{!a} is not numeric.'.format(ch)
        self.assertFalse(ch.isnumeric(), message)

    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for ch in numeric:
        message = '{!a} is numeric.'.format(ch)
        self.assertTrue(ch.isnumeric(), message)
676
Martin v. Löwis47383402007-08-15 07:32:56 +0000677 def test_isidentifier(self):
678 self.assertTrue("a".isidentifier())
679 self.assertTrue("Z".isidentifier())
680 self.assertTrue("_".isidentifier())
681 self.assertTrue("b0".isidentifier())
682 self.assertTrue("bc".isidentifier())
683 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000684 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500685 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000686
687 self.assertFalse(" ".isidentifier())
688 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000689 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000690 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000691
Georg Brandl559e5d72008-06-11 18:37:52 +0000692 def test_isprintable(self):
693 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000694 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000695 self.assertTrue("abcdefg".isprintable())
696 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000697 # some defined Unicode character
698 self.assertTrue("\u0374".isprintable())
699 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000700 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000701 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000702 self.assertFalse("\ud800".isprintable())
703
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300704 self.assertTrue('\U0001F46F'.isprintable())
705 self.assertFalse('\U000E0020'.isprintable())
706
707 def test_surrogates(self):
708 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
709 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
710 self.assertTrue(s.islower())
711 self.assertFalse(s.isupper())
712 self.assertFalse(s.istitle())
713 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
714 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
715 self.assertFalse(s.islower())
716 self.assertTrue(s.isupper())
717 self.assertTrue(s.istitle())
718
719 for meth_name in ('islower', 'isupper', 'istitle'):
720 meth = getattr(str, meth_name)
721 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
722 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
723
724 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
725 'isdecimal', 'isnumeric',
726 'isidentifier', 'isprintable'):
727 meth = getattr(str, meth_name)
728 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
729 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
730 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
731 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
732
733
def test_lower(self):
    """Check str.lower() on non-BMP characters and special-casing inputs."""
    string_tests.CommonTest.test_lower(self)
    self.assertEqual('\U00010427'.lower(), '\U0001044F')
    self.assertEqual('\U00010427\U00010427'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.lower(),
                     'x\U0001044Fx\U0001044F')
    # U+FB01 LATIN SMALL LIGATURE FI.  Written as an escape so that text
    # tooling cannot silently turn it into the two ASCII letters "fi",
    # which would make this check trivial.
    self.assertEqual('\ufb01'.lower(), '\ufb01')
    self.assertEqual('\u0130'.lower(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.lower(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
    self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
    self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
    self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
    self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
    # Inputs that lower() must return unchanged.
    self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
    self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300755
Benjamin Petersond5890c82012-01-14 13:23:30 -0500756 def test_casefold(self):
757 self.assertEqual('hello'.casefold(), 'hello')
758 self.assertEqual('hELlo'.casefold(), 'hello')
759 self.assertEqual('ß'.casefold(), 'ss')
760 self.assertEqual('fi'.casefold(), 'fi')
761 self.assertEqual('\u03a3'.casefold(), '\u03c3')
762 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700763 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500764
def test_upper(self):
    """Check str.upper() on non-BMP characters and special-casing inputs."""
    string_tests.CommonTest.test_upper(self)
    self.assertEqual('\U0001044F'.upper(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('X\U00010427x\U0001044F'.upper(),
                     'X\U00010427X\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI upper-cases to two letters.  Written as
    # an escape so that text tooling cannot silently replace it with ASCII.
    self.assertEqual('\ufb01'.upper(), 'FI')
    self.assertEqual('\u0130'.upper(), '\u0130')
    self.assertEqual('\u03a3'.upper(), '\u03a3')
    self.assertEqual('ß'.upper(), 'SS')
    self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
    self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
    self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300781
def test_capitalize(self):
    """Check str.capitalize() on non-BMP and special-casing inputs."""
    string_tests.CommonTest.test_capitalize(self)
    self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                     'X\U0001044Fx\U0001044F')
    self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
    exp = '\u0399\u0308\u0300\u0069\u0307'
    self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
    # The input must start with U+FB01 LATIN SMALL LIGATURE FI: with the
    # plain ASCII letters "fi" the result would be 'Finnish' and this
    # assertion could never hold.  Written as an escape so that text tooling
    # cannot silently replace the ligature.
    # NOTE(review): 'FInnish' relies on capitalize() upper-casing the first
    # character; Python 3.8+ title-cases it instead -- confirm against the
    # interpreter version this file targets.
    self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
    self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300798
def test_title(self):
    """Check str.title() on non-BMP characters and special-casing inputs."""
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    self.assertEqual('\U0001044F'.title(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
    # The input starts with U+FB01 LATIN SMALL LIGATURE FI, written as an
    # escape so that text tooling cannot silently replace it with ASCII.
    self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
    self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
    self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300815
def test_swapcase(self):
    """Check str.swapcase() on non-BMP and special-casing inputs."""
    string_tests.CommonTest.test_swapcase(self)
    self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
    self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
    self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.swapcase(),
                     '\U0001044F\U00010427')
    self.assertEqual('\U0001044F\U00010427'.swapcase(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                     'x\U0001044FX\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI swaps to two uppercase letters.
    # Written as an escape so that text tooling cannot silently replace the
    # ligature with ASCII.
    self.assertEqual('\ufb01'.swapcase(), 'FI')
    self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.swapcase(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
    self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
    self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
    self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
    self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
    # Characters whose swapped form expands to several code points.
    self.assertEqual('ß'.swapcase(), 'SS')
    self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300841
def test_center(self):
    """Check str.center() with a non-BMP fill character."""
    string_tests.CommonTest.test_center(self)
    fill = '\U0010FFFF'
    # (requested width, expected result of centering 'x')
    expected_by_width = (
        (2, 'x\U0010FFFF'),
        (3, '\U0010FFFFx\U0010FFFF'),
        (4, '\U0010FFFFx\U0010FFFF\U0010FFFF'),
    )
    for width, expected in expected_by_width:
        self.assertEqual('x'.center(width, fill), expected)
850
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    """Check that upper() on a very long string raises OverflowError.

    Only runs when sys.maxsize is 2**31 - 1; skipped if the input string
    cannot be allocated.
    """
    # Issue #22643
    char_count = 2**32//12 + 1
    try:
        huge = "ü" * char_count
    except MemoryError:
        required_mib = char_count / 2**20
        self.skipTest('no enough memory (%.0f MiB required)' % required_mib)
    try:
        with self.assertRaises(OverflowError):
            huge.upper()
    finally:
        # Drop the reference to the big string even if the assertion fails.
        del huge
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400864
Walter Dörwald28256f22003-01-19 16:59:20 +0000865 def test_contains(self):
866 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000867 self.assertIn('a', 'abdb')
868 self.assertIn('a', 'bdab')
869 self.assertIn('a', 'bdaba')
870 self.assertIn('a', 'bdba')
871 self.assertNotIn('a', 'bdb')
872 self.assertIn('a', 'bdba')
873 self.assertIn('a', ('a',1,None))
874 self.assertIn('a', (1,None,'a'))
875 self.assertIn('a', ('a',1,None))
876 self.assertIn('a', (1,None,'a'))
877 self.assertNotIn('a', ('x',1,'y'))
878 self.assertNotIn('a', ('x',1,None))
879 self.assertNotIn('abcd', 'abcxxxx')
880 self.assertIn('ab', 'abcd')
881 self.assertIn('ab', 'abc')
882 self.assertIn('ab', (1,None,'ab'))
883 self.assertIn('', 'abc')
884 self.assertIn('', '')
885 self.assertIn('', 'abc')
886 self.assertNotIn('\0', 'abc')
887 self.assertIn('\0', '\0abc')
888 self.assertIn('\0', 'abc\0')
889 self.assertIn('a', '\0abc')
890 self.assertIn('asdf', 'asdf')
891 self.assertNotIn('asdf', 'asd')
892 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000893
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000894 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200895 # test mixed kinds
896 for fill in ('a', '\u0100', '\U00010300'):
897 fill *= 9
898 for delim in ('c', '\u0102', '\U00010302'):
899 self.assertNotIn(delim, fill)
900 self.assertIn(delim, fill + delim)
901 self.assertNotIn(delim * 2, fill)
902 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000903
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300904 def test_issue18183(self):
905 '\U00010000\U00100000'.lower()
906 '\U00010000\U00100000'.casefold()
907 '\U00010000\U00100000'.upper()
908 '\U00010000\U00100000'.capitalize()
909 '\U00010000\U00100000'.title()
910 '\U00010000\U00100000'.swapcase()
911 '\U00100000'.center(3, '\U00010000')
912 '\U00100000'.ljust(3, '\U00010000')
913 '\U00100000'.rjust(3, '\U00010000')
914
Eric Smith8c663262007-08-25 02:26:07 +0000915 def test_format(self):
916 self.assertEqual(''.format(), '')
917 self.assertEqual('a'.format(), 'a')
918 self.assertEqual('ab'.format(), 'ab')
919 self.assertEqual('a{{'.format(), 'a{')
920 self.assertEqual('a}}'.format(), 'a}')
921 self.assertEqual('{{b'.format(), '{b')
922 self.assertEqual('}}b'.format(), '}b')
923 self.assertEqual('a{{b'.format(), 'a{b')
924
925 # examples from the PEP:
926 import datetime
927 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
928 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
929 "My name is Fred")
930 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
931 "My name is Fred :-{}")
932
933 d = datetime.date(2007, 8, 18)
934 self.assertEqual("The year is {0.year}".format(d),
935 "The year is 2007")
936
Eric Smith8c663262007-08-25 02:26:07 +0000937 # classes we'll use for testing
938 class C:
939 def __init__(self, x=100):
940 self._x = x
941 def __format__(self, spec):
942 return spec
943
944 class D:
945 def __init__(self, x):
946 self.x = x
947 def __format__(self, spec):
948 return str(self.x)
949
950 # class with __str__, but no __format__
951 class E:
952 def __init__(self, x):
953 self.x = x
954 def __str__(self):
955 return 'E(' + self.x + ')'
956
957 # class with __repr__, but no __format__ or __str__
958 class F:
959 def __init__(self, x):
960 self.x = x
961 def __repr__(self):
962 return 'F(' + self.x + ')'
963
964 # class with __format__ that forwards to string, for some format_spec's
965 class G:
966 def __init__(self, x):
967 self.x = x
968 def __str__(self):
969 return "string is " + self.x
970 def __format__(self, format_spec):
971 if format_spec == 'd':
972 return 'G(' + self.x + ')'
973 return object.__format__(self, format_spec)
974
Eric Smith739e2ad2007-08-27 19:07:22 +0000975 class I(datetime.date):
976 def __format__(self, format_spec):
977 return self.strftime(format_spec)
978
Eric Smith185e30c2007-08-30 22:23:08 +0000979 class J(int):
980 def __format__(self, format_spec):
981 return int.__format__(self * 2, format_spec)
982
Eric Smith8c663262007-08-25 02:26:07 +0000983
984 self.assertEqual(''.format(), '')
985 self.assertEqual('abc'.format(), 'abc')
986 self.assertEqual('{0}'.format('abc'), 'abc')
987 self.assertEqual('{0:}'.format('abc'), 'abc')
988# self.assertEqual('{ 0 }'.format('abc'), 'abc')
989 self.assertEqual('X{0}'.format('abc'), 'Xabc')
990 self.assertEqual('{0}X'.format('abc'), 'abcX')
991 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
992 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
993 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
994 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
995 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
996 self.assertEqual('{0}'.format(-15), '-15')
997 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
998 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
999 self.assertEqual('{{'.format(), '{')
1000 self.assertEqual('}}'.format(), '}')
1001 self.assertEqual('{{}}'.format(), '{}')
1002 self.assertEqual('{{x}}'.format(), '{x}')
1003 self.assertEqual('{{{0}}}'.format(123), '{123}')
1004 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1005 self.assertEqual('}}{{'.format(), '}{')
1006 self.assertEqual('}}x{{'.format(), '}x{')
1007
Eric Smith7ade6482007-08-26 22:27:13 +00001008 # weird field names
1009 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1010 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001011 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001012
Eric Smith8c663262007-08-25 02:26:07 +00001013 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1014 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1015 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1016 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1017 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1018 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1019 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1020
Eric Smith8c663262007-08-25 02:26:07 +00001021 # strings
1022 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1023 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1024 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1025 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1026 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1027 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1028 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1029 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1030 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1031 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1032 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1033 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1034 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1035 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1036 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1037 self.assertEqual('{0:>7s}'.format('result'), ' result')
1038 self.assertEqual('{0:>8s}'.format('result'), ' result')
1039 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1040 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1041 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1042 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1043 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1044 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1045
Eric V. Smith2ea97122014-04-14 11:55:10 -04001046 # issue 12546: use \x00 as a fill character
1047 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1048 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1049 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1050 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1051
1052 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1053 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1054 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1055 self.assertEqual('{0:<6}'.format(3), '3 ')
1056
1057 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1058 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1059 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1060 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1061
1062 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1063 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1064 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1065 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1066
Eric Smith8c663262007-08-25 02:26:07 +00001067 # format specifiers for user defined type
1068 self.assertEqual('{0:abc}'.format(C()), 'abc')
1069
Georg Brandld52429f2008-07-04 15:55:02 +00001070 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001071 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1072 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1073 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1074 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1075 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1076 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1077 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001078 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001079 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1080 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001081 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001082 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001083 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001084 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1085 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001086 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001087
Eric Smith8c663262007-08-25 02:26:07 +00001088 # test fallback to object.__format__
1089 self.assertEqual('{0}'.format({}), '{}')
1090 self.assertEqual('{0}'.format([]), '[]')
1091 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001092
Eric Smith8c663262007-08-25 02:26:07 +00001093 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001094 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1095
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001096 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1097 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1098 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001099
Eric Smith739e2ad2007-08-27 19:07:22 +00001100 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1101 month=8,
1102 day=27)),
1103 "date: 2007-08-27")
1104
Eric Smith185e30c2007-08-30 22:23:08 +00001105 # test deriving from a builtin type and overriding __format__
1106 self.assertEqual("{0}".format(J(10)), "20")
1107
1108
Eric Smith8c663262007-08-25 02:26:07 +00001109 # string format specifiers
1110 self.assertEqual('{0:}'.format('a'), 'a')
1111
1112 # computed format specifiers
1113 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1114 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1115 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1116 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1117 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1118
1119 # test various errors
1120 self.assertRaises(ValueError, '{'.format)
1121 self.assertRaises(ValueError, '}'.format)
1122 self.assertRaises(ValueError, 'a{'.format)
1123 self.assertRaises(ValueError, 'a}'.format)
1124 self.assertRaises(ValueError, '{a'.format)
1125 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001126 self.assertRaises(IndexError, '{0}'.format)
1127 self.assertRaises(IndexError, '{1}'.format, 'abc')
1128 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001129 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001130 self.assertRaises(ValueError, "abc{0:{}".format)
1131 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001132 self.assertRaises(IndexError, "{0.}".format)
1133 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001134 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001135 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001136 self.assertRaises(KeyError, "{0]}".format)
1137 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001138 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001139 self.assertRaises(ValueError, "{0[0}".format, 0)
1140 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1141 self.assertRaises(KeyError, "{c]}".format)
1142 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1143 self.assertRaises(ValueError, "{0}}".format, 0)
1144 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001145 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001146 self.assertRaises(ValueError, "{0!}".format, 0)
1147 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001148 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001149 self.assertRaises(IndexError, "{:}".format)
1150 self.assertRaises(IndexError, "{:s}".format)
1151 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001152 big = "23098475029384702983476098230754973209482573"
1153 self.assertRaises(ValueError, ("{" + big + "}").format)
1154 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001155
Eric Smith41669ca2009-05-23 14:23:22 +00001156 # issue 6089
1157 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1158 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1159
Eric Smith8c663262007-08-25 02:26:07 +00001160 # can't have a replacement on the field name portion
1161 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1162
1163 # exceed maximum recursion depth
1164 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1165 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1166 0, 1, 2, 3, 4, 5, 6, 7)
1167
1168 # string format spec errors
1169 self.assertRaises(ValueError, "{0:-s}".format, '')
1170 self.assertRaises(ValueError, format, "", "-")
1171 self.assertRaises(ValueError, "{0:=s}".format, '')
1172
Eric Smithb1ebcc62008-07-15 13:02:41 +00001173 # Alternate formatting is not supported
1174 self.assertRaises(ValueError, format, '', '#')
1175 self.assertRaises(ValueError, format, '', '#20')
1176
Victor Stinnerece58de2012-04-23 23:36:38 +02001177 # Non-ASCII
1178 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1179 'ABC\u0410\u0411\u0412')
1180 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1181 'ABC')
1182 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1183 '')
1184
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001185 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001186 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1187 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1188 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1189 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1190 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1191 self.assertRaises(ValueError, "{a{}b}".format, 42)
1192 self.assertRaises(ValueError, "{a{b}".format, 42)
1193 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001194
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001195 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001196
Eric Smith27bbca62010-11-04 17:06:58 +00001197 def test_format_map(self):
1198 self.assertEqual(''.format_map({}), '')
1199 self.assertEqual('a'.format_map({}), 'a')
1200 self.assertEqual('ab'.format_map({}), 'ab')
1201 self.assertEqual('a{{'.format_map({}), 'a{')
1202 self.assertEqual('a}}'.format_map({}), 'a}')
1203 self.assertEqual('{{b'.format_map({}), '{b')
1204 self.assertEqual('}}b'.format_map({}), '}b')
1205 self.assertEqual('a{{b'.format_map({}), 'a{b')
1206
1207 # using mappings
1208 class Mapping(dict):
1209 def __missing__(self, key):
1210 return key
1211 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1212 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1213
1214 class InternalMapping:
1215 def __init__(self):
1216 self.mapping = {'a': 'hello'}
1217 def __getitem__(self, key):
1218 return self.mapping[key]
1219 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1220
1221
Eric Smith27bbca62010-11-04 17:06:58 +00001222 class C:
1223 def __init__(self, x=100):
1224 self._x = x
1225 def __format__(self, spec):
1226 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001227 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1228
1229 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001230 self.assertRaises(TypeError, ''.format_map)
1231 self.assertRaises(TypeError, 'a'.format_map)
1232
1233 self.assertRaises(ValueError, '{'.format_map, {})
1234 self.assertRaises(ValueError, '}'.format_map, {})
1235 self.assertRaises(ValueError, 'a{'.format_map, {})
1236 self.assertRaises(ValueError, 'a}'.format_map, {})
1237 self.assertRaises(ValueError, '{a'.format_map, {})
1238 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001239
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001240 # issue #12579: can't supply positional params to format_map
1241 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1242 self.assertRaises(ValueError, '{}'.format_map, 'a')
1243 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1244
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001245 def test_format_huge_precision(self):
1246 format_string = ".{}f".format(sys.maxsize + 1)
1247 with self.assertRaises(ValueError):
1248 result = format(2.34, format_string)
1249
1250 def test_format_huge_width(self):
1251 format_string = "{}f".format(sys.maxsize + 1)
1252 with self.assertRaises(ValueError):
1253 result = format(2.34, format_string)
1254
1255 def test_format_huge_item_number(self):
1256 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1257 with self.assertRaises(ValueError):
1258 result = format_string.format(2.34)
1259
Eric Smith8ec90442009-03-14 12:29:34 +00001260 def test_format_auto_numbering(self):
1261 class C:
1262 def __init__(self, x=100):
1263 self._x = x
1264 def __format__(self, spec):
1265 return spec
1266
1267 self.assertEqual('{}'.format(10), '10')
1268 self.assertEqual('{:5}'.format('s'), 's ')
1269 self.assertEqual('{!r}'.format('s'), "'s'")
1270 self.assertEqual('{._x}'.format(C(10)), '10')
1271 self.assertEqual('{[1]}'.format([1, 2]), '2')
1272 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1273 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1274
1275 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1276 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1277
1278 # can't mix and match numbering and auto-numbering
1279 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1280 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1281 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1282 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1283
1284 # can mix and match auto-numbering and named
1285 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1286 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1287 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1288 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1289
def test_formatting(self):
    # Run the shared %-formatting checks from string_tests first, then the
    # str-specific cases below.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # '%5.2f' pads to five columns, hence the double space before values
    # that are only four characters wide.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3),
                     'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3),
                     'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5),
                     'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57),
                     'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57),
                     'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"),
                     '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"),
                     '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; a negative width
    # left-justifies.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    #issue 19995
    class PseudoInt:
        # Integer-like: provides both __int__ and __index__.
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # Provides __int__ only (no __index__); the integer conversions
        # below are expected to reject it.
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001376
Ethan Furmanfb137212013-08-31 10:18:55 -07001377 def test_formatting_with_enum(self):
1378 # issue18780
1379 import enum
1380 class Float(float, enum.Enum):
1381 PI = 3.1415926
1382 class Int(enum.IntEnum):
1383 IDES = 15
1384 class Str(str, enum.Enum):
1385 ABC = 'abc'
1386 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001387 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1388 'Str.ABC, Str.ABC')
1389 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1390 (Str.ABC, Str.ABC,
1391 Int.IDES, Int.IDES, Int.IDES,
1392 Float.PI, Float.PI),
1393 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001394
1395 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001396 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1397 '...Str.ABC...')
1398 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1399 '...Int.IDES...')
1400 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1401 '...15...')
1402 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1403 '...15...')
1404 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1405 '...15...')
1406 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1407 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001408
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001409 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001410 format_string = "%.{}f".format(sys.maxsize + 1)
1411 with self.assertRaises(ValueError):
1412 result = format_string % 2.34
1413
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # %-formatting with a precision just above the C INT_MAX (taken from
    # _testcapi) must raise ValueError.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        format_string % 2.34
1420
1421 def test_formatting_huge_width(self):
1422 format_string = "%{}f".format(sys.maxsize + 1)
1423 with self.assertRaises(ValueError):
1424 result = format_string % 2.34
1425
Ezio Melottiba42fd52011-04-26 06:09:45 +03001426 def test_startswith_endswith_errors(self):
1427 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001428 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001429 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001430 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001431 self.assertIn('str', exc)
1432 self.assertIn('tuple', exc)
1433
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001438
def test_constructor(self):
    # Exercise the str() constructor: the one-argument form on strings,
    # str subclasses and objects defining __str__, then the decoding form
    # str(obj, encoding, errors).

    # str(obj) tests
    self.assertEqual(
        str('unicode remains unicode'),
        'unicode remains unicode'
    )

    # One sample per character width: ASCII, Latin-1, BMP and astral.
    for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
        subclass = StrSubclass(text)
        self.assertEqual(str(subclass), text)
        self.assertEqual(len(subclass), len(text))
        if text == 'ascii':
            self.assertEqual(subclass.encode('ascii'), b'ascii')
            self.assertEqual(subclass.encode('utf-8'), b'ascii')

    self.assertEqual(
        str('strings are converted to unicode'),
        'strings are converted to unicode'
    )

    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    self.assertEqual(
        str(StringCompat('__str__ compatible objects are recognized')),
        '__str__ compatible objects are recognized'
    )

    # str(obj) goes through the object's __str__:
    o = StringCompat('unicode(obj) is compatible to str()')
    self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    # str() applied to its own result must be a no-op.
    for obj in (123, 123.45):
        self.assertEqual(str(obj), str(str(obj)))

    # str(obj, encoding, error) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    if not sys.platform.startswith('java'):
        # A str cannot itself be decoded.
        self.assertRaises(
            TypeError,
            str,
            'decoding unicode is not supported',
            'utf-8',
            'strict'
        )

    self.assertEqual(
        str(b'strings are decoded to unicode', 'utf-8', 'strict'),
        'strings are decoded to unicode'
    )

    if not sys.platform.startswith('java'):
        self.assertEqual(
            str(
                memoryview(b'character buffers are decoded to unicode'),
                'utf-8',
                'strict'
            ),
            'character buffers are decoded to unicode'
        )

    self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001508
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001509 def test_constructor_keyword_args(self):
1510 """Pass various keyword argument combinations to the constructor."""
1511 # The object argument can be passed as a keyword.
1512 self.assertEqual(str(object='foo'), 'foo')
1513 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1514 # The errors argument without encoding triggers "decode" mode.
1515 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1516 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1517
1518 def test_constructor_defaults(self):
1519 """Check the constructor argument defaults."""
1520 # The object argument defaults to '' or b''.
1521 self.assertEqual(str(), '')
1522 self.assertEqual(str(errors='strict'), '')
1523 utf8_cent = '¢'.encode('utf-8')
1524 # The encoding argument defaults to utf-8.
1525 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1526 # The errors argument defaults to strict.
1527 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1528
Walter Dörwald28256f22003-01-19 16:59:20 +00001529 def test_codecs_utf7(self):
1530 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001531 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1532 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1533 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1534 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1535 ('+', b'+-'),
1536 ('+-', b'+--'),
1537 ('+?', b'+-?'),
1538 ('\?', b'+AFw?'),
1539 ('+?', b'+-?'),
1540 (r'\\?', b'+AFwAXA?'),
1541 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001542 (r'++--', b'+-+---'),
1543 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1544 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001545 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001546
Walter Dörwald28256f22003-01-19 16:59:20 +00001547 for (x, y) in utfTests:
1548 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001549
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001550 # Unpaired surrogates are passed through
1551 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1552 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1553 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1554 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1555 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1556 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1557 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1558 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001559
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001560 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1561 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001562
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001563 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001564 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001565
1566 # Direct encoded characters
1567 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1568 # Optional direct characters
1569 set_o = '!"#$%&*;<=>@[]^_`{|}'
1570 for c in set_d:
1571 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1572 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1573 for c in set_o:
1574 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001575
Walter Dörwald28256f22003-01-19 16:59:20 +00001576 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001577 self.assertEqual(''.encode('utf-8'), b'')
1578 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001579 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1580 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001581 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1582 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001583 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1584 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001585 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001586 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1587 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1588 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1589 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1590 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1591 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001592 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1593 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1594 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1595 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1596 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1597 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1598 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1599 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1600 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1601 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001602 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001603
Walter Dörwald28256f22003-01-19 16:59:20 +00001604 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001605 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1606 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1607 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Walter Dörwald28256f22003-01-19 16:59:20 +00001609 # Other possible utf-8 test cases:
1610 # * strict decoding testing for all of the
1611 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612
Ezio Melotti57221d02010-07-01 07:32:02 +00001613 def test_utf8_decode_valid_sequences(self):
1614 sequences = [
1615 # single byte
1616 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1617 # 2 bytes
1618 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1619 # 3 bytes
1620 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1621 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1622 # 4 bytes
1623 (b'\xF0\x90\x80\x80', '\U00010000'),
1624 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1625 ]
1626 for seq, res in sequences:
1627 self.assertEqual(seq.decode('utf-8'), res)
1628
1629
1630 def test_utf8_decode_invalid_sequences(self):
1631 # continuation bytes in a sequence of 2, 3, or 4 bytes
1632 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001633 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001634 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001635 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001636 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1637 invalid_start_bytes = (
1638 continuation_bytes + invalid_2B_seq_start_bytes +
1639 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1640 )
1641
1642 for byte in invalid_start_bytes:
1643 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1644
1645 for sb in invalid_2B_seq_start_bytes:
1646 for cb in continuation_bytes:
1647 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1648
1649 for sb in invalid_4B_seq_start_bytes:
1650 for cb1 in continuation_bytes[:3]:
1651 for cb3 in continuation_bytes[:3]:
1652 self.assertRaises(UnicodeDecodeError,
1653 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1654
1655 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1656 self.assertRaises(UnicodeDecodeError,
1657 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1658 self.assertRaises(UnicodeDecodeError,
1659 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1660 # surrogates
1661 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1662 self.assertRaises(UnicodeDecodeError,
1663 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1664 self.assertRaises(UnicodeDecodeError,
1665 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1666 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1667 self.assertRaises(UnicodeDecodeError,
1668 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1669 self.assertRaises(UnicodeDecodeError,
1670 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1671 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1672 self.assertRaises(UnicodeDecodeError,
1673 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1674 self.assertRaises(UnicodeDecodeError,
1675 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1676
1677 def test_issue8271(self):
1678 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1679 # only the start byte and the continuation byte(s) are now considered
1680 # invalid, instead of the number of bytes specified by the start byte.
1681 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1682 # table 3-8, Row 2) for more information about the algorithm used.
1683 FFFD = '\ufffd'
1684 sequences = [
1685 # invalid start bytes
1686 (b'\x80', FFFD), # continuation byte
1687 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1688 (b'\xc0', FFFD),
1689 (b'\xc0\xc0', FFFD*2),
1690 (b'\xc1', FFFD),
1691 (b'\xc1\xc0', FFFD*2),
1692 (b'\xc0\xc1', FFFD*2),
1693 # with start byte of a 2-byte sequence
1694 (b'\xc2', FFFD), # only the start byte
1695 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001696 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001697 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1698 # with start byte of a 3-byte sequence
1699 (b'\xe1', FFFD), # only the start byte
1700 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1701 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1702 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1703 (b'\xe1\x80', FFFD), # only 1 continuation byte
1704 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1705 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1706 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1707 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1708 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1709 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1710 # with start byte of a 4-byte sequence
1711 (b'\xf1', FFFD), # only the start byte
1712 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1713 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1714 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1715 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1716 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1717 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1718 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1719 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1720 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1721 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1722 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1723 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1724 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1725 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1726 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1727 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1728 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1729 # with invalid start byte of a 4-byte sequence (rfc2279)
1730 (b'\xf5', FFFD), # only the start byte
1731 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1732 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1733 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1734 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1735 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1736 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1737 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1738 # with invalid start byte of a 5-byte sequence (rfc2279)
1739 (b'\xf8', FFFD), # only the start byte
1740 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1741 (b'\xf8\x80', FFFD*2), # only one continuation byte
1742 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1743 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1744 # with invalid start byte of a 6-byte sequence (rfc2279)
1745 (b'\xfc', FFFD), # only the start byte
1746 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1747 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1748 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1749 # invalid start byte
1750 (b'\xfe', FFFD),
1751 (b'\xfe\x80\x80', FFFD*3),
1752 # other sequences
1753 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1754 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1755 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1756 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1757 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1758 ]
1759 for n, (seq, res) in enumerate(sequences):
1760 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1761 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1762 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1763 self.assertEqual(seq.decode('utf-8', 'ignore'),
1764 res.replace('\uFFFD', ''))
1765
def to_bytestring(self, seq):
    """Turn a string of whitespace-separated hex byte values into bytes.

    For instance 'E1 80' yields the two bytes 0xE1 0x80.
    """
    hex_tokens = seq.split()
    return bytes([int(token, 16) for token in hex_tokens])
1768
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check how the invalid UTF-8 byte sequence seq is decoded:

    * with 'strict' it must raise a UnicodeDecodeError whose message
      contains err;
    * with 'replace' it must decode to res;
    * with 'ignore' it must decode to res with every U+FFFD removed.

    The 'replace' and 'ignore' checks are done twice: on seq alone and
    on seq surrounded by valid ASCII bytes.
    """
    with self.assertRaises(UnicodeDecodeError) as cm:
        seq.decode('utf-8')
    self.assertIn(err, str(cm.exception))

    padded = b'aaaa' + seq + b'bbbb'
    without_fffd = res.replace('\ufffd', '')
    for handler, expected in (('replace', res), ('ignore', without_fffd)):
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(padded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')
1787
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first
    byte is neither in the ASCII range nor a valid start byte of a 2-,
    3-, or 4-bytes sequence.  With errors='replace' the invalid start
    byte is replaced with a single U+FFFD.
    E.g. <80> is a continuation byte and can appear only after a start
    byte.
    """
    FFFD = '\ufffd'
    invalid_start_bytes = b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF'
    # Iterating over bytes yields ints, so rebuild a 1-byte bytes object.
    for value in invalid_start_bytes:
        self.assertCorrectUTF8Decoding(bytes([value]), FFFD,
                                       'invalid start byte')
1800
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the
    string ends after a start byte of a 2-, 3-, or 4-bytes sequence
    without having enough continuation bytes.  With errors='replace'
    the incomplete sequence is replaced with a single U+FFFD.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it is followed by only 2 valid continuation bytes and
    the last continuation byte is missing.
    Note: the continuation bytes must all be valid; if one of them is
    invalid, a different error is raised.
    """
    FFFD = '\ufffd'
    truncated = [
        # start bytes of 2-bytes sequences
        'C2', 'DF',
        # start bytes of 3-bytes sequences plus one continuation byte
        'E0 A0', 'E0 BF',
        'E1 80', 'E1 BF',
        'EC 80', 'EC BF',
        'ED 80', 'ED 9F',
        'EE 80', 'EE BF',
        'EF 80', 'EF BF',
        # start bytes of 4-bytes sequences plus one or two continuation
        # bytes
        'F0 90', 'F0 BF',
        'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF',
        'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF',
        'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F',
        'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF',
    ]
    for hex_seq in truncated:
        raw = self.to_bytestring(hex_seq)
        self.assertCorrectUTF8Decoding(raw, FFFD, 'unexpected end of data')
1826
def test_invalid_cb_for_2bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte of a 2-bytes sequence is invalid.  With
    errors='replace' the start byte is replaced by a single U+FFFD and
    the second byte is handled separately.
    E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
    sequence, but 41 is not a valid continuation byte because it is the
    ASCII letter 'A'.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Second bytes that are not continuation bytes, with the expected
    # 'replace' output for <start byte, second byte>.
    bad_second_bytes = [
        ('00', FFFD+'\x00'), ('7F', FFFD+'\x7f'),
        ('C0', FFFDx2), ('FF', FFFDx2),
    ]
    sequences = [
        (start + ' ' + second, res)
        for start in ('C2', 'DF')
        for second, res in bad_second_bytes
    ]
    for hex_seq, expected in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(hex_seq),
                                       expected,
                                       'invalid continuation byte')
1848
def test_invalid_cb_for_3bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 3-bytes sequence are invalid.  With
    errors='replace', if the first continuation byte is valid, the
    first two bytes (start byte + 1st cb) are replaced by a single
    U+FFFD and the third byte is handled separately; otherwise only the
    start byte is replaced with a U+FFFD and the following bytes are
    handled separately.
    E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it is the ASCII letter 'A'.
    Note: when the start byte is E0 or ED, the valid ranges for the
    first continuation byte are limited to A0..BF and 80..9F
    respectively.  Python 2 used to consider all the bytes in range
    80..BF valid when the start byte was ED.  This is fixed in Python 3.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Bytes that are never continuation bytes, with the expected
    # 'replace' output when one of them interrupts a sequence.
    ascii_breakers = [('00', FFFD+'\x00'), ('7F', FFFD+'\x7f')]
    high_breakers = [('C0', FFFDx2), ('FF', FFFDx2)]
    # (start byte,
    #  bytes in 80..BF that are rejected as first continuation byte,
    #  lowest and highest byte accepted as first continuation byte)
    first_cb_table = [
        ('E0', ('80', '9F'), ('A0', 'BF')),
        ('E1', (), ('80', 'BF')),
        ('EC', (), ('80', 'BF')),
        ('ED', ('A0', 'BF'), ('80', '9F')),     # see note ^
        ('EE', (), ('80', 'BF')),
        ('EF', (), ('80', 'BF')),
    ]
    sequences = []
    for start, rejected, accepted in first_cb_table:
        # start byte directly followed by an unacceptable byte
        second_bytes = (ascii_breakers
                        + [(cb, FFFDx2) for cb in rejected]
                        + high_breakers)
        sequences.extend((start + ' ' + cb, res)
                         for cb, res in second_bytes)
        # start byte, acceptable 1st cb, then an invalid 2nd cb
        for cb1 in accepted:
            sequences.extend((' '.join((start, cb1, cb)), res)
                             for cb, res in ascii_breakers + high_breakers)
    for hex_seq, expected in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(hex_seq),
                                       expected,
                                       'invalid continuation byte')
1906
def test_invalid_cb_for_4bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 4-bytes sequence are invalid.  With
    errors='replace', the start byte and all the following valid
    continuation bytes are replaced with a single U+FFFD, and all the
    bytes starting from the first invalid continuation byte (included)
    are handled separately.
    E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it is the ASCII letter 'A'.
    Note: when the start byte is F0 or F4, the valid ranges for the
    first continuation byte are limited to 90..BF and 80..8F
    respectively.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Bytes that are never continuation bytes, with the expected
    # 'replace' output when one of them interrupts a sequence.
    ascii_breakers = [('00', FFFD+'\x00'), ('7F', FFFD+'\x7f')]
    high_breakers = [('C0', FFFDx2), ('FF', FFFDx2)]
    breakers = ascii_breakers + high_breakers
    # (start byte,
    #  bytes in 80..BF that are rejected as first continuation byte,
    #  lowest and highest byte accepted as first continuation byte)
    first_cb_table = [
        ('F0', ('80', '8F'), ('90', 'BF')),
        ('F1', (), ('80', 'BF')),
        ('F3', (), ('80', 'BF')),
        ('F4', ('90', 'BF'), ('80', '8F')),
    ]
    sequences = []
    for start, rejected, accepted in first_cb_table:
        # start byte directly followed by an unacceptable byte
        second_bytes = (ascii_breakers
                        + [(cb, FFFDx2) for cb in rejected]
                        + high_breakers)
        sequences.extend((start + ' ' + cb, res)
                         for cb, res in second_bytes)
        # start byte, acceptable 1st cb, then an invalid 2nd cb
        for cb1 in accepted:
            sequences.extend((' '.join((start, cb1, cb)), res)
                             for cb, res in breakers)
        # start byte, acceptable 1st and 2nd cb, then an invalid 3rd cb
        for cb1 in accepted:
            for cb2 in ('80', 'BF'):
                sequences.extend((' '.join((start, cb1, cb2, cb)), res)
                                 for cb, res in breakers)
    for hex_seq, expected in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(hex_seq),
                                       expected,
                                       'invalid continuation byte')
1985
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001986 def test_codecs_idna(self):
1987 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001988 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001989
def test_codecs_errors(self):
    """Check the error handling of str.encode(), bytes.decode() and str()."""
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding)
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Positional and keyword arguments must give the same result.
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, str, data, 'ascii')
    self.assertRaises(UnicodeError, str, data, 'ascii', 'strict')
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')
    self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

    # Error handling (unknown character names)
    named = b"\\N{foo}xx"
    self.assertEqual(named.decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # Error handling (bad codec return values): test.unicode1 and
    # test.unicode2 are served by the search function registered at the
    # top of this file; their encoders/decoders return 42 and (42, 42).
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    for codec_name in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, "hello".encode, codec_name)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            self.assertRaises(UnicodeError, convert, lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00002027
Walter Dörwald28256f22003-01-19 16:59:20 +00002028 def test_codecs(self):
2029 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002030 self.assertEqual('hello'.encode('ascii'), b'hello')
2031 self.assertEqual('hello'.encode('utf-7'), b'hello')
2032 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002033 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002034 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2035 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2036 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002037
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002038 # Default encoding is utf-8
2039 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2040
Walter Dörwald28256f22003-01-19 16:59:20 +00002041 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002042 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002043 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002044 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2045 'utf-16-be', 'raw_unicode_escape',
2046 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002047 with warnings.catch_warnings():
2048 # unicode-internal has been deprecated
2049 warnings.simplefilter("ignore", DeprecationWarning)
2050
2051 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002052
Walter Dörwald28256f22003-01-19 16:59:20 +00002053 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002054 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002055 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002056 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002057 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002058
Walter Dörwald28256f22003-01-19 16:59:20 +00002059 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002060 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002061 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002062 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002063 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002064
Walter Dörwald28256f22003-01-19 16:59:20 +00002065 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002066 with warnings.catch_warnings():
2067 # unicode-internal has been deprecated
2068 warnings.simplefilter("ignore", DeprecationWarning)
2069
2070 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2071 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2072 'raw_unicode_escape',
2073 'unicode_escape', 'unicode_internal'):
2074 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002075
Antoine Pitrou51f66482011-11-11 13:35:44 +01002076 # UTF-8 must be roundtrip safe for all code points
2077 # (except surrogates, which are forbidden).
2078 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002079 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002080 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002081 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002082
Walter Dörwald28256f22003-01-19 16:59:20 +00002083 def test_codecs_charmap(self):
2084 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002085 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002086 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002087 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002088 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2089 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002090 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002091 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2092 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002093 'iso8859_7', 'iso8859_9',
2094 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002095 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002096
Walter Dörwald28256f22003-01-19 16:59:20 +00002097 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2098 'cp1256', 'cp1257', 'cp1258',
2099 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002100
Walter Dörwald28256f22003-01-19 16:59:20 +00002101 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2102 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002103
Walter Dörwald28256f22003-01-19 16:59:20 +00002104 ### These have undefined mappings:
2105 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002106
Walter Dörwald28256f22003-01-19 16:59:20 +00002107 ### These fail the round-trip:
2108 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002109
Walter Dörwald28256f22003-01-19 16:59:20 +00002110 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002111 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002112
Walter Dörwald28256f22003-01-19 16:59:20 +00002113 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002114 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002115 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002116 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002117 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2118 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002119 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002120 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2121 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002122 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002123 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002124
Walter Dörwald28256f22003-01-19 16:59:20 +00002125 ### These have undefined mappings:
2126 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2127 #'cp1256', 'cp1257', 'cp1258',
2128 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002129 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002130 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002131
Walter Dörwald28256f22003-01-19 16:59:20 +00002132 ### These fail the round-trip:
2133 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002134
Walter Dörwald28256f22003-01-19 16:59:20 +00002135 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002136 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002137
Walter Dörwald28256f22003-01-19 16:59:20 +00002138 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002139 self.assertEqual(("abc" "def"), "abcdef")
2140 self.assertEqual(("abc" "def"), "abcdef")
2141 self.assertEqual(("abc" "def"), "abcdef")
2142 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2143 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002144
Walter Dörwald28256f22003-01-19 16:59:20 +00002145 def test_printing(self):
2146 class BitBucket:
2147 def write(self, text):
2148 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002149
Walter Dörwald28256f22003-01-19 16:59:20 +00002150 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002151 print('abc', file=out)
2152 print('abc', 'def', file=out)
2153 print('abc', 'def', file=out)
2154 print('abc', 'def', file=out)
2155 print('abc\n', file=out)
2156 print('abc\n', end=' ', file=out)
2157 print('abc\n', end=' ', file=out)
2158 print('def\n', file=out)
2159 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002160
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002161 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002162 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002163 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2164 self.assertEqual(x, y)
2165
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002166 y = br'\U00100000'
2167 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2168 self.assertEqual(x, y)
2169 y = br'\U00010000'
2170 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2171 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002172
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002173 try:
2174 br'\U11111111'.decode("raw-unicode-escape")
2175 except UnicodeDecodeError as e:
2176 self.assertEqual(e.start, 0)
2177 self.assertEqual(e.end, 10)
2178 else:
2179 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002180
def test_conversion(self):
    # Make sure __str__() works properly
    class PlainWithStr:
        def __str__(self):
            return "foo"

    class StrWithStr(str):
        def __str__(self):
            return "foo"

    class DoublingStr(str):
        # Stores its content twice and returns itself from __str__().
        def __new__(cls, content=""):
            return str.__new__(cls, 2*content)
        def __str__(self):
            return self

    for source in (PlainWithStr(), StrWithStr("bar")):
        self.assertEqual(str(source), "foo")

    # str() hands back whatever __str__() returned, subclass included.
    converted = str(DoublingStr("foo"))
    self.assertEqual(converted, "foofoo")
    self.assertIs(type(converted), DoublingStr)

    # A str subclass constructor produces an instance of that subclass.
    converted = StrSubclass(DoublingStr("foo"))
    self.assertEqual(converted, "foofoo")
    self.assertIs(type(converted), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002205
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002206 def test_unicode_repr(self):
2207 class s1:
2208 def __repr__(self):
2209 return '\\n'
2210
2211 class s2:
2212 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002213 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002214
2215 self.assertEqual(repr(s1()), '\\n')
2216 self.assertEqual(repr(s2()), '\\n')
2217
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002218 def test_printable_repr(self):
2219 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002220 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002221
Zachary Ware9fe6d862013-12-08 00:20:35 -06002222 # This test only affects 32-bit platforms because expandtabs can only take
2223 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2224 # to take a 64-bit long, this test should apply to all platforms.
2225 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2226 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002227 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002228 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002229
@support.cpython_only
def test_expandtabs_optimization(self):
    # With no tab to expand, expandtabs() must hand back the very same
    # object instead of a copy.
    text = 'abc'
    expanded = text.expandtabs()
    self.assertIs(expanded, text)
2234
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002235 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 if struct.calcsize('P') == 8:
2237 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002238 ascii_struct_size = 48
2239 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 else:
2241 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002242 ascii_struct_size = 24
2243 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244
2245 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2246 code = ord(char)
2247 if code < 0x100:
2248 char_size = 1 # sizeof(Py_UCS1)
2249 struct_size = ascii_struct_size
2250 elif code < 0x10000:
2251 char_size = 2 # sizeof(Py_UCS2)
2252 struct_size = compact_struct_size
2253 else:
2254 char_size = 4 # sizeof(Py_UCS4)
2255 struct_size = compact_struct_size
2256 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002257 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2258 # be allocatable, given enough memory.
2259 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 alloc = lambda: char * maxlen
2261 self.assertRaises(MemoryError, alloc)
2262 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002263
Victor Stinner808fc0a2010-03-22 12:50:40 +00002264 def test_format_subclass(self):
2265 class S(str):
2266 def __str__(self):
2267 return '__str__ overridden'
2268 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002269 self.assertEqual("%s" % s, '__str__ overridden')
2270 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002271
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002272 # Test PyUnicode_FromFormat()
def test_from_format(self):
    """Exercise the C function PyUnicode_FromFormat() through ctypes.

    Each check calls the C function with a bytes format string plus
    ctypes/str arguments and compares the resulting str with the expected
    text.  The sections cover %c, literal %, truncation via precision,
    width/precision on the string-like conversions, the integer
    conversions, padding, %A, %V and a few unsupported formats.
    """
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(fmt, *args):
        # str arguments are wrapped in py_object; every other argument
        # (bytes, None, ctypes integers) is handed to ctypes unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(fmt, *cargs)

    def check_format(expected, fmt, *args):
        text = PyUnicode_FromFormat(fmt, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error.  The pattern is a raw string so that "\(" reaches
    # the regex engine instead of being an invalid string escape.
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd',
                 b'%c', c_int(0xabcd))
    check_format('\U0010ffff',
                 b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%',
                 b'%')
    check_format('%',
                 b'%%')
    check_format('%s',
                 b'%%s')
    check_format('[%]',
                 b'[%%]')
    check_format('%abc',
                 b'%%%s', b'abc')

    # truncated string
    check_format('abc',
                 b'%.3s', b'abcdef')
    check_format('abc[\ufffd',
                 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'",
                 b'%A', '\u20acABC')
    check_format("'\\u20",
                 b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'",
                 b'%R', '\u20acABC')
    check_format("'\u20acA",
                 b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # The following tests come from #7330.  Expected padding is written
    # with str.rjust() so the field width is explicit in the test.

    # test width modifier and precision modifier with %S
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5S', 'abc')
    check_format("repr=ab",
                 b'repr=%.2S', 'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format("repr=" + "'abc'".rjust(8),
                 b'repr=%8R', 'abc')
    check_format("repr='ab",
                 b'repr=%.3R', 'abc')
    check_format("repr=" + "'ab".rjust(5),
                 b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format("repr=" + "'abc'".rjust(8),
                 b'repr=%8A', 'abc')
    check_format("repr='ab",
                 b'repr=%.3A', 'abc')
    check_format("repr=" + "'ab".rjust(5),
                 b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5s', b'abc')
    check_format("repr=ab",
                 b'repr=%.2s', b'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5U', 'abc')
    check_format("repr=ab",
                 b'repr=%.2U', 'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5V', 'abc', b'123')
    check_format("repr=ab",
                 b'repr=%.2V', 'abc', b'123')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2V', 'abc', b'123')
    check_format("repr=" + "123".rjust(5),
                 b'repr=%5V', None, b'123')
    check_format("repr=12",
                 b'repr=%.2V', None, b'123')
    check_format("repr=" + "12".rjust(5),
                 b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010',
                 b'%03i', c_int(10))
    check_format('0010',
                 b'%0.4i', c_int(10))
    check_format('-123',
                 b'%i', c_int(-123))
    check_format('-123',
                 b'%li', c_long(-123))
    check_format('-123',
                 b'%lli', c_longlong(-123))
    check_format('-123',
                 b'%zi', c_ssize_t(-123))

    check_format('-123',
                 b'%d', c_int(-123))
    check_format('-123',
                 b'%ld', c_long(-123))
    check_format('-123',
                 b'%lld', c_longlong(-123))
    check_format('-123',
                 b'%zd', c_ssize_t(-123))

    check_format('123',
                 b'%u', c_uint(123))
    check_format('123',
                 b'%lu', c_ulong(123))
    check_format('123',
                 b'%llu', c_ulonglong(123))
    check_format('123',
                 b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong),
                 b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong),
                 b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong),
                 b'%llu', c_ulonglong(max_ulonglong))
    # The result is deliberately not compared: only the call is exercised.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'),
                 b'%010i', c_int(123))
    check_format('123'.rjust(100),
                 b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'),
                 b'%010u', c_uint(123))
    check_format('123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'),
                 b'%010x', c_int(0x123))
    check_format('123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'),
                 b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc',
                 b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. These tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s',
                 b'%1%s', b'abc')
    check_format('%1abc',
                 b'%1abc')
    check_format('%+i',
                 b'%+i', c_int(10))
    check_format('%.%s',
                 b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002504
Victor Stinner1c24bd02010-10-02 11:03:13 +00002505 # Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Check _testcapi.unicode_aswidechar() for several buffer lengths.

    Each case passes a string and a buffer length and checks both the
    returned size and the returned wide-character text.
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, buffer length, expected size, expected wchar text)
    scenarios = (
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    )
    for source, capacity, want_size, want_text in scenarios:
        wchar, size = unicode_aswidechar(source, capacity)
        self.assertEqual(size, want_size)
        self.assertEqual(wchar, want_text)

    # A non-BMP character: the buffer length and expected size depend on
    # the platform's wchar_t width.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        buflen, nchar = 3, 2
    else:  # sizeof(c_wchar) == 4
        buflen, nchar = 2, 1
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002542
Victor Stinner1c24bd02010-10-02 11:03:13 +00002543 # Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    """Check _testcapi.unicode_aswidecharstring() sizes and contents."""
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, expected size, expected wchar text)
    for source, want_size, want_text in (
            ('abc', 3, 'abc\0'),
            ('abc\0def', 7, 'abc\0def\0')):
        wchar, size = unicode_aswidecharstring(source)
        self.assertEqual(size, want_size)
        self.assertEqual(wchar, want_text)

    # A non-BMP character: the expected size is 2 with a 2-byte wchar_t
    # and 1 otherwise.
    nonbmp = chr(0x10ffff)
    nchar = 2 if sizeof(c_wchar) == 2 else 1
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002566
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002567 def test_subclass_add(self):
2568 class S(str):
2569 def __add__(self, o):
2570 return "3"
2571 self.assertEqual(S("4") + S("5"), "3")
2572 class S(str):
2573 def __iadd__(self, o):
2574 return "3"
2575 s = S("1")
2576 s += "4"
2577 self.assertEqual(s, "3")
2578
@support.cpython_only
def test_encode_decimal(self):
    """Check _testcapi.unicode_encodedecimal() results and errors."""
    from _testcapi import unicode_encodedecimal

    # (input text, expected encoded bytes)
    conversions = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for source, encoded in conversions:
        self.assertEqual(unicode_encodedecimal(source), encoded)

    # A character that cannot be encoded is rejected under "strict" ...
    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")
    # ... and a different error handler is reported as a ValueError.
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002594
@support.cpython_only
def test_transform_decimal(self):
    """Check _testcapi.unicode_transformdecimaltoascii() on sample text."""
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    # (input text, expected transformed text); the last two inputs are
    # expected to come back unchanged.
    samples = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for source, transformed in samples:
        self.assertEqual(transform_decimal(source), transformed)
2606
Victor Stinnerc814a382011-11-22 01:06:15 +01002607 def test_getnewargs(self):
2608 text = 'abc'
2609 args = text.__getnewargs__()
2610 self.assertIsNot(args[0], text)
2611 self.assertEqual(args[0], text)
2612 self.assertEqual(len(args), 1)
2613
def test_resize(self):
    """Encoding with unicode_internal must stay correct after a resize.

    For several lengths, a string is encoded, extended with "+=", and
    encoded again; both encodings must round-trip and must differ.
    """
    expected_warning = ('unicode_internal codec has been deprecated',
                        DeprecationWarning)
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        with support.check_warnings(expected_warning):
            # fill wstr internal field
            before = text.encode('unicode_internal')
            self.assertEqual(before.decode('unicode_internal'), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode('unicode_internal')
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode('unicode_internal'), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002630
def test_compare(self):
    """Compare strings of every character width with each other.

    Eight strings are built (two each of ASCII, Latin-1, BMP and astral
    characters).  Every pair, including each string paired with itself,
    is checked for ==/!=, and then the relative order of the four kinds
    is checked explicitly.
    """
    # Issue #17615
    N = 10
    ascii1 = 'a' * N
    ascii2 = 'z' * N
    latin1 = '\x80' * N
    latin2 = '\xff' * N
    bmp1 = '\u0100' * N
    bmp2 = '\uffff' * N
    astral1 = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii1, ascii2,
        latin1, latin2,
        bmp1, bmp2,
        astral1, astral2)
    # combinations_with_replacement() also yields (s, s) pairs; plain
    # combinations() never does, which left the "equal" branch unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii1 < ascii2)
    self.assertTrue(ascii1 < latin1)
    self.assertTrue(ascii1 < bmp1)
    self.assertTrue(ascii1 < astral1)
    self.assertFalse(ascii1 >= ascii2)
    self.assertFalse(ascii1 >= latin1)
    self.assertFalse(ascii1 >= bmp1)
    self.assertFalse(ascii1 >= astral1)

    self.assertFalse(latin1 < ascii1)
    self.assertTrue(latin1 < latin2)
    self.assertTrue(latin1 < bmp1)
    self.assertTrue(latin1 < astral1)
    self.assertTrue(latin1 >= ascii1)
    self.assertFalse(latin1 >= latin2)
    self.assertFalse(latin1 >= bmp1)
    self.assertFalse(latin1 >= astral1)

    self.assertFalse(bmp1 < ascii1)
    self.assertFalse(bmp1 < latin1)
    self.assertTrue(bmp1 < bmp2)
    self.assertTrue(bmp1 < astral1)
    self.assertTrue(bmp1 >= ascii1)
    self.assertTrue(bmp1 >= latin1)
    self.assertFalse(bmp1 >= bmp2)
    self.assertFalse(bmp1 >= astral1)

    self.assertFalse(astral1 < ascii1)
    self.assertFalse(astral1 < latin1)
    self.assertFalse(astral1 < bmp2)
    self.assertTrue(astral1 < astral2)
    self.assertTrue(astral1 >= ascii1)
    self.assertTrue(astral1 >= latin1)
    self.assertTrue(astral1 >= bmp2)
    self.assertFalse(astral1 >= astral2)
2704
@support.cpython_only
def test_pep393_utf8_caching_bug(self):
    """Growing a string with "+=" must not leave a stale UTF-8 cache."""
    # Issue #25709: Problem with string concatenation and utf-8 cache
    from _testcapi import getargs_s_hash
    for codepoint in (0x24, 0xa4, 0x20ac, 0x1f40d):
        unit = chr(codepoint).encode()
        s = ''
        for count in range(1, 6):
            # Due to CPython specific optimization the 's' string can be
            # resized in-place.
            s += chr(codepoint)
            expected = unit * count
            # Parsing with the "s#" format code calls indirectly
            # PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            self.assertEqual(getargs_s_hash(s), expected)
            # Check that the second call returns the same result
            self.assertEqual(getargs_s_hash(s), expected)
2721
Victor Stinner1c24bd02010-10-02 11:03:13 +00002722
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        """formatter_parser() splits a format string into 4-tuples."""
        def parsed(template):
            return list(_string.formatter_parser(template))

        # (format string, expected list of parsed 4-tuples)
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in expectations:
            self.assertEqual(parsed(template), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() yields the base name and accessors."""
        def split(name):
            first, rest = _string.formatter_field_name_split(name)
            # The second item is materialised as a list so it can be
            # compared directly against the expected value.
            return [first, list(rest)]

        # (field name, expected [base name, accessor list])
        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2772
2773
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()