blob: c30310e1ae827489ef73e5102ebac40c05b59792 [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function serving deliberately broken test codecs.

    ``test.unicode1`` resolves to an encoder/decoder pair that returns a
    bare int instead of a tuple; ``test.unicode2`` resolves to a pair
    that returns a tuple whose first item is not a string.  Any other
    encoding name yields ``None``.
    """
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode
    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    elif encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    else:
        return None

codecs.register(search_function)
35
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.
    """
    # 'surrogatepass' lets text containing lone surrogates round-trip
    # through UTF-8 instead of raising UnicodeEncodeError.
    return text.encode('utf-8', 'surrogatepass').decode('utf-8', 'surrogatepass')
45
class StrSubclass(str):
    """Trivial str subclass that adds no behaviour of its own."""
    pass
48
Brett Cannon226b2302010-03-20 22:22:22 +000049class UnicodeTest(string_tests.CommonTest,
50 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020051 string_tests.MixinStrUnicodeTest,
52 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000053
Guido van Rossumef87d6e2007-05-02 19:09:54 +000054 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000055
56 def checkequalnofix(self, result, object, methodname, *args):
57 method = getattr(object, methodname)
58 realresult = method(*args)
59 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000060 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000061
62 # if the original is returned make sure that
63 # this doesn't happen with subclasses
64 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000065 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000066 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000067 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000068 object = usub(object)
69 method = getattr(object, methodname)
70 realresult = method(*args)
71 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000072 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000073
Jeremy Hylton504de6b2003-10-06 05:08:26 +000074 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000075 self.assertEqual('\xff', '\u00ff')
76 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000077 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
78 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
79 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000080 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000081 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000082
Georg Brandl559e5d72008-06-11 18:37:52 +000083 def test_ascii(self):
84 if not sys.platform.startswith('java'):
85 # Test basic sanity of repr()
86 self.assertEqual(ascii('abc'), "'abc'")
87 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
88 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
89 self.assertEqual(ascii('\\c'), "'\\\\c'")
90 self.assertEqual(ascii('\\'), "'\\\\'")
91 self.assertEqual(ascii('\n'), "'\\n'")
92 self.assertEqual(ascii('\r'), "'\\r'")
93 self.assertEqual(ascii('\t'), "'\\t'")
94 self.assertEqual(ascii('\b'), "'\\x08'")
95 self.assertEqual(ascii("'\""), """'\\'"'""")
96 self.assertEqual(ascii("'\""), """'\\'"'""")
97 self.assertEqual(ascii("'"), '''"'"''')
98 self.assertEqual(ascii('"'), """'"'""")
99 latin1repr = (
100 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
101 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
102 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
103 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
104 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
105 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
106 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
107 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
108 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
109 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
110 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
111 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
112 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
113 "\\xfe\\xff'")
114 testrepr = ascii(''.join(map(chr, range(256))))
115 self.assertEqual(testrepr, latin1repr)
116 # Test ascii works on wide unicode escapes without overflow.
117 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
118 ascii("\U00010000" * 39 + "\uffff" * 4096))
119
120 class WrongRepr:
121 def __repr__(self):
122 return b'byte-repr'
123 self.assertRaises(TypeError, ascii, WrongRepr())
124
Walter Dörwald28256f22003-01-19 16:59:20 +0000125 def test_repr(self):
126 if not sys.platform.startswith('java'):
127 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000128 self.assertEqual(repr('abc'), "'abc'")
129 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
130 self.assertEqual(repr('ab\\'), "'ab\\\\'")
131 self.assertEqual(repr('\\c'), "'\\\\c'")
132 self.assertEqual(repr('\\'), "'\\\\'")
133 self.assertEqual(repr('\n'), "'\\n'")
134 self.assertEqual(repr('\r'), "'\\r'")
135 self.assertEqual(repr('\t'), "'\\t'")
136 self.assertEqual(repr('\b'), "'\\x08'")
137 self.assertEqual(repr("'\""), """'\\'"'""")
138 self.assertEqual(repr("'\""), """'\\'"'""")
139 self.assertEqual(repr("'"), '''"'"''')
140 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000141 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000142 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000143 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
144 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
145 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
146 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
147 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000148 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
149 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
150 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
151 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
152 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
153 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
154 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
155 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000156 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000157 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000158 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000159 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
160 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000161
Georg Brandl559e5d72008-06-11 18:37:52 +0000162 class WrongRepr:
163 def __repr__(self):
164 return b'byte-repr'
165 self.assertRaises(TypeError, repr, WrongRepr())
166
Guido van Rossum49d6b072006-08-17 21:11:47 +0000167 def test_iterators(self):
168 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000169 it = "\u1111\u2222\u3333".__iter__()
170 self.assertEqual(next(it), "\u1111")
171 self.assertEqual(next(it), "\u2222")
172 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000173 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000174
Walter Dörwald28256f22003-01-19 16:59:20 +0000175 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000176 string_tests.CommonTest.test_count(self)
177 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000178 self.checkequalnofix(3, 'aaa', 'count', 'a')
179 self.checkequalnofix(0, 'aaa', 'count', 'b')
180 self.checkequalnofix(3, 'aaa', 'count', 'a')
181 self.checkequalnofix(0, 'aaa', 'count', 'b')
182 self.checkequalnofix(0, 'aaa', 'count', 'b')
183 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
184 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
185 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
186 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200187 # test mixed kinds
188 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
189 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
190 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
191 self.checkequal(0, 'a' * 10, 'count', '\u0102')
192 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
193 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
194 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
195 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
196 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
197 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
198 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
199 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000200
Walter Dörwald28256f22003-01-19 16:59:20 +0000201 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200202 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200203 # test implementation details of the memchr fast path
204 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
205 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
206 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
207 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
208 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
209 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
210 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
211 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000212 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
213 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
214 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000215
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000216 self.assertRaises(TypeError, 'hello'.find)
217 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200218 # test mixed kinds
219 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
220 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
221 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
222 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
223 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
224 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
225 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
226 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
227 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
228 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
229 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
230 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000231
Walter Dörwald28256f22003-01-19 16:59:20 +0000232 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000233 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200234 # test implementation details of the memrchr fast path
235 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
236 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
237 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
238 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
239 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
240 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
241 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000242 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000243 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
244 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
245 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200246 # test mixed kinds
247 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
248 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
249 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
250 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
251 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
252 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
253 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
254 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
255 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
256 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
257 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
258 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000259
Walter Dörwald28256f22003-01-19 16:59:20 +0000260 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000261 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000262 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
263 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
264 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
265 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
266 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
267 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
268 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
269 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200270 # test mixed kinds
271 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
272 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
273 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
274 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
275 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
276 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
277 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
278 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
279 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
280 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
281 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
282 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000283
Walter Dörwald28256f22003-01-19 16:59:20 +0000284 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000285 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000286 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
287 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
288 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
289 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000290
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000291 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
292 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
293 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
294 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
295 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200296 # test mixed kinds
297 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
298 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
299 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
300 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
301 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
302 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
303 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
304 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
305 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
306 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
307 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
308 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000309
Georg Brandlceee0772007-11-27 23:48:05 +0000310 def test_maketrans_translate(self):
311 # these work with plain translate()
312 self.checkequalnofix('bbbc', 'abababc', 'translate',
313 {ord('a'): None})
314 self.checkequalnofix('iiic', 'abababc', 'translate',
315 {ord('a'): None, ord('b'): ord('i')})
316 self.checkequalnofix('iiix', 'abababc', 'translate',
317 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
318 self.checkequalnofix('c', 'abababc', 'translate',
319 {ord('a'): None, ord('b'): ''})
320 self.checkequalnofix('xyyx', 'xzx', 'translate',
321 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200322
Georg Brandlceee0772007-11-27 23:48:05 +0000323 # this needs maketrans()
324 self.checkequalnofix('abababc', 'abababc', 'translate',
325 {'b': '<i>'})
326 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
327 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
328 # test alternative way of calling maketrans()
329 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
330 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
331
Victor Stinner5a29f252014-04-05 00:17:51 +0200332 # various tests switching from ASCII to latin1 or the opposite;
333 # same length, remove a letter, or replace with a longer string.
334 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
335 "[X]")
336 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
337 "[X]")
338 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
339 "[]")
340 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
341 "[XXX]")
342 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
343 "[\xe9]")
Victor Stinner33798672016-03-01 21:59:58 +0100344 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
345 "x123")
346 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
347 "x\xe9")
348
349 # test non-ASCII (don't take the fast-path)
Victor Stinner5a29f252014-04-05 00:17:51 +0200350 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
351 "[<\xe9>]")
352 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
353 "[a]")
354 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
355 "[]")
Victor Stinner33798672016-03-01 21:59:58 +0100356 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
357 "[123]")
358 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
359 "[<\u20ac>\xe9]")
Victor Stinner5a29f252014-04-05 00:17:51 +0200360
Victor Stinner4ff33af2014-04-05 11:56:37 +0200361 # invalid Unicode characters
362 invalid_char = 0x10ffff+1
363 for before in "a\xe9\u20ac\U0010ffff":
364 mapping = str.maketrans({before: invalid_char})
365 text = "[%s]" % before
366 self.assertRaises(ValueError, text.translate, mapping)
367
368 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000369 self.assertRaises(TypeError, self.type2test.maketrans)
370 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
371 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
372 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
373 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
374 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
375 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000376
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000377 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000378 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000379
Walter Dörwald28256f22003-01-19 16:59:20 +0000380 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000381 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000382
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000383 # Mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000384 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
385 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
386 self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200387 # test mixed kinds
388 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
389 left *= 9
390 right *= 9
391 for delim in ('c', '\u0102', '\U00010302'):
392 self.checkequal([left + right],
393 left + right, 'split', delim)
394 self.checkequal([left, right],
395 left + delim + right, 'split', delim)
396 self.checkequal([left + right],
397 left + right, 'split', delim * 2)
398 self.checkequal([left, right],
399 left + delim * 2 + right, 'split', delim *2)
400
401 def test_rsplit(self):
402 string_tests.CommonTest.test_rsplit(self)
403 # test mixed kinds
404 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
405 left *= 9
406 right *= 9
407 for delim in ('c', '\u0102', '\U00010302'):
408 self.checkequal([left + right],
409 left + right, 'rsplit', delim)
410 self.checkequal([left, right],
411 left + delim + right, 'rsplit', delim)
412 self.checkequal([left + right],
413 left + right, 'rsplit', delim * 2)
414 self.checkequal([left, right],
415 left + delim * 2 + right, 'rsplit', delim *2)
416
417 def test_partition(self):
418 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
419 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300420 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200421 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
422 left *= 9
423 right *= 9
424 for delim in ('c', '\u0102', '\U00010302'):
425 self.checkequal((left + right, '', ''),
426 left + right, 'partition', delim)
427 self.checkequal((left, delim, right),
428 left + delim + right, 'partition', delim)
429 self.checkequal((left + right, '', ''),
430 left + right, 'partition', delim * 2)
431 self.checkequal((left, delim * 2, right),
432 left + delim * 2 + right, 'partition', delim * 2)
433
434 def test_rpartition(self):
435 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
436 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300437 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200438 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
439 left *= 9
440 right *= 9
441 for delim in ('c', '\u0102', '\U00010302'):
442 self.checkequal(('', '', left + right),
443 left + right, 'rpartition', delim)
444 self.checkequal((left, delim, right),
445 left + delim + right, 'rpartition', delim)
446 self.checkequal(('', '', left + right),
447 left + right, 'rpartition', delim * 2)
448 self.checkequal((left, delim * 2, right),
449 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000450
Walter Dörwald28256f22003-01-19 16:59:20 +0000451 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000452 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000453
Guido van Rossumf1044292007-09-27 18:01:22 +0000454 class MyWrapper:
455 def __init__(self, sval): self.sval = sval
456 def __str__(self): return self.sval
457
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000458 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000459 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
460 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
461 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
462 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
463 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
464 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
465 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000466 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
467 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
468 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
469 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000470
Walter Dörwald28256f22003-01-19 16:59:20 +0000471 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000472 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000473
Walter Dörwald28256f22003-01-19 16:59:20 +0000474 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000475 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
476 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200477 # test mixed kinds
478 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
479 left *= 9
480 right *= 9
481 for delim in ('c', '\u0102', '\U00010302'):
482 for repl in ('d', '\u0103', '\U00010303'):
483 self.checkequal(left + right,
484 left + right, 'replace', delim, repl)
485 self.checkequal(left + repl + right,
486 left + delim + right,
487 'replace', delim, repl)
488 self.checkequal(left + right,
489 left + right, 'replace', delim * 2, repl)
490 self.checkequal(left + repl + right,
491 left + delim * 2 + right,
492 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000493
Victor Stinner59de0ee2011-10-07 10:01:28 +0200494 @support.cpython_only
495 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200496 pattern = 'abc'
497 text = 'abc def'
498 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200499
Guido van Rossum98297ee2007-11-06 21:34:58 +0000500 def test_bytes_comparison(self):
Brett Cannon226b2302010-03-20 22:22:22 +0000501 with support.check_warnings():
502 warnings.simplefilter('ignore', BytesWarning)
503 self.assertEqual('abc' == b'abc', False)
504 self.assertEqual('abc' != b'abc', True)
505 self.assertEqual('abc' == bytearray(b'abc'), False)
506 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000507
Walter Dörwald28256f22003-01-19 16:59:20 +0000508 def test_comparison(self):
509 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000510 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000511 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000512 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000513
514 if 0:
515 # Move these tests to a Unicode collation module test...
516 # Testing UTF-16 code point order comparisons...
517
518 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000519 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000520 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000521 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000522
523 # Non surrogate above surrogate value, fixup required
524 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000525 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000526
527 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000528 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000529 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000530 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000531 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000532 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000533 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000534 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000535 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000536 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000537 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000538 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000539 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000540 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000541 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000542 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000543 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000544 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000545 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000546 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000547 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000548 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000549 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000550 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000551 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000552 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000553 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000554 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000555 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000556 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000557 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000558 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000559 test_lecmp(s, s2)
560
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000561 test_fixup('\ue000')
562 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000563
564 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000565 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000566
def test_islower(self):
    # str.islower() on BMP and non-BMP code points, run after the
    # shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    expectations = (
        ('\u2167', False),
        ('\u2177', True),
        # non-BMP, uppercase
        ('\U00010401', False),
        ('\U00010427', False),
        # non-BMP, lowercase
        ('\U00010429', True),
        ('\U0001044E', True),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_lower in expectations:
        verify = self.assertTrue if is_lower else self.assertFalse
        verify(char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000581
def test_isupper(self):
    # str.isupper() on BMP and non-BMP code points, run after the
    # shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    java_platform = sys.platform.startswith('java')
    if not java_platform:
        # This check is not performed on 'java' platforms.
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    expectations = (
        ('\u2167', True),
        ('\u2177', False),
        # non-BMP, uppercase
        ('\U00010401', True),
        ('\U00010427', True),
        # non-BMP, lowercase
        ('\U00010429', False),
        ('\U0001044E', False),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_upper in expectations:
        verify = self.assertTrue if is_upper else self.assertFalse
        verify(char.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000597
def test_istitle(self):
    # str.istitle() on BMP and non-BMP text, run after the shared
    # mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for text in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(text.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']
    for ch in not_title:
        message = '{!a} is not title'.format(ch)
        self.assertFalse(ch.istitle(), message)
609
def test_isspace(self):
    # str.isspace() on BMP and non-BMP code points, run after the
    # shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    bmp_cases = (
        ('\u2000', True),
        ('\u200a', True),
        ('\u2014', False),
    )
    for char, is_space in bmp_cases:
        self.checkequalnofix(is_space, char, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F']
    for ch in non_bmp:
        message = '{!a} is not space.'.format(ch)
        self.assertFalse(ch.isspace(), message)
619
def test_isalnum(self):
    # str.isalnum() must be true for each of these non-BMP characters;
    # the shared mixin checks run first.
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    alnum_chars = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in alnum_chars:
        message = '{!a} is alnum.'.format(ch)
        self.assertTrue(ch.isalnum(), message)
Walter Dörwald28256f22003-01-19 16:59:20 +0000625
def test_isalpha(self):
    # str.isalpha() on BMP and non-BMP code points, run after the
    # shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    for cased in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(cased.isalpha())
    # non-BMP, non-cased
    for uncased in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(uncased.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000637
def test_isdecimal(self):
    # str.isdecimal(): table-driven checks through checkequalnofix,
    # the argument-count error, then non-BMP characters.
    table = (
        ('', False),
        ('a', False),
        ('0', True),
        ('\u2460', False),  # CIRCLED DIGIT ONE
        ('\xbc', False),  # VULGAR FRACTION ONE QUARTER
        ('\u0660', True),  # ARABIC-INDIC DIGIT ZERO
        ('0123456789', True),
        ('0123456789a', False),
    )
    for text, expected in table:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    not_decimal = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    decimal = ['\U0001D7F6', '\U00011066', '\U000104A0']
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
655
def test_isdigit(self):
    # str.isdigit() on BMP and non-BMP code points, run after the
    # shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    bmp_cases = (
        ('\u2460', True),
        ('\xbc', False),
        ('\u0660', True),
    )
    for char, is_digit in bmp_cases:
        self.checkequalnofix(is_digit, char, 'isdigit')

    not_digits = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065']
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    digits = ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
667
def test_isnumeric(self):
    # str.isnumeric(): table-driven checks through checkequalnofix,
    # the argument-count error, then non-BMP characters.
    table = (
        ('', False),
        ('a', False),
        ('0', True),
        ('\u2460', True),
        ('\xbc', True),
        ('\u0660', True),
        ('0123456789', True),
        ('0123456789a', False),
    )
    for text, expected in table:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments
    with self.assertRaises(TypeError):
        "abc".isnumeric(42)

    not_numeric = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    numeric = ['\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107']
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
686
Martin v. Löwis47383402007-08-15 07:32:56 +0000687 def test_isidentifier(self):
688 self.assertTrue("a".isidentifier())
689 self.assertTrue("Z".isidentifier())
690 self.assertTrue("_".isidentifier())
691 self.assertTrue("b0".isidentifier())
692 self.assertTrue("bc".isidentifier())
693 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000694 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500695 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000696
697 self.assertFalse(" ".isidentifier())
698 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000699 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000700 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000701
Georg Brandl559e5d72008-06-11 18:37:52 +0000702 def test_isprintable(self):
703 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000704 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000705 self.assertTrue("abcdefg".isprintable())
706 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000707 # some defined Unicode character
708 self.assertTrue("\u0374".isprintable())
709 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000710 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000711 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000712 self.assertFalse("\ud800".isprintable())
713
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300714 self.assertTrue('\U0001F46F'.isprintable())
715 self.assertFalse('\U000E0020'.isprintable())
716
717 def test_surrogates(self):
718 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
719 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
720 self.assertTrue(s.islower())
721 self.assertFalse(s.isupper())
722 self.assertFalse(s.istitle())
723 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
724 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
725 self.assertFalse(s.islower())
726 self.assertTrue(s.isupper())
727 self.assertTrue(s.istitle())
728
729 for meth_name in ('islower', 'isupper', 'istitle'):
730 meth = getattr(str, meth_name)
731 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
732 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
733
734 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
735 'isdecimal', 'isnumeric',
736 'isidentifier', 'isprintable'):
737 meth = getattr(str, meth_name)
738 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
739 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
740 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
741 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
742
743
def test_lower(self):
    # str.lower() on non-BMP characters and on characters with special
    # case mappings, run after the shared CommonTest checks.
    string_tests.CommonTest.test_lower(self)
    cases = (
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
        # U+FB01 LATIN SMALL LIGATURE FI, spelled as an escape so the
        # literal cannot be flattened to the two ASCII letters 'fi'
        # (which would make this check vacuous).
        ('\ufb01', '\ufb01'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0345\u03c3'),
        ('A\u0345\u03a3', 'a\u0345\u03c2'),
        ('A\u0345\u03a3a', 'a\u0345\u03c3a'),
        ('A\u03a3\u0345', 'a\u03c2\u0345'),
        ('\u03a3\u0345 ', '\u03c3\u0345 '),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2177'),
    )
    for text, expected in cases:
        self.assertEqual(text.lower(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300765
Benjamin Petersond5890c82012-01-14 13:23:30 -0500766 def test_casefold(self):
767 self.assertEqual('hello'.casefold(), 'hello')
768 self.assertEqual('hELlo'.casefold(), 'hello')
769 self.assertEqual('ß'.casefold(), 'ss')
770 self.assertEqual('fi'.casefold(), 'fi')
771 self.assertEqual('\u03a3'.casefold(), '\u03c3')
772 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700773 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500774
def test_upper(self):
    # str.upper() on non-BMP characters and on characters with special
    # case mappings, run after the shared CommonTest checks.
    string_tests.CommonTest.test_upper(self)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
        # U+FB01 LATIN SMALL LIGATURE FI upper-cases to two letters.
        # Spelled as an escape so the literal cannot be flattened to
        # ASCII 'fi', which would make this check vacuous.
        ('\ufb01', 'FI'),
        ('\u0130', '\u0130'),
        ('\u03a3', '\u03a3'),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2167'),
    )
    for text, expected in cases:
        self.assertEqual(text.upper(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300791
def test_capitalize(self):
    # str.capitalize() on non-BMP characters and on characters with
    # special case mappings, run after the shared CommonTest checks.
    string_tests.CommonTest.test_capitalize(self)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
        ('h\u0130', 'H\u0069\u0307'),
        ('\u1fd2\u0130', '\u0399\u0308\u0300\u0069\u0307'),
        ('A\u0345\u03a3', 'A\u0345\u03c2'),
    )
    for text, expected in cases:
        self.assertEqual(text.capitalize(), expected)

    # The input must start with U+FB01 LATIN SMALL LIGATURE FI: plain
    # ASCII 'finnish' capitalizes to 'Finnish' and could never yield
    # 'FInnish'.  The escape keeps the literal from being flattened.
    # Since Python 3.8 capitalize() title-cases the first character
    # ('Fi') instead of upper-casing it ('FI').
    if sys.version_info >= (3, 8):
        ligature_expected = 'Finnish'
    else:
        ligature_expected = 'FInnish'
    self.assertEqual('\ufb01nnish'.capitalize(), ligature_expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300808
def test_title(self):
    # str.title() on non-BMP characters and on characters with special
    # case mappings, run after the shared mixin checks.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    deseret_title = '\U00010427\U0001044F \U00010427\U0001044F'
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U0001044F \U0001044F\U0001044F', deseret_title),
        ('\U00010427\U0001044F \U00010427\U0001044F', deseret_title),
        ('\U0001044F\U00010427 \U0001044F\U00010427', deseret_title),
        ('X\U00010427x\U0001044F X\U00010427x\U0001044F',
         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
        # U+FB01 LATIN SMALL LIGATURE FI title-cases to 'Fi'.  Spelled
        # as an escape so the literal cannot be flattened to ASCII
        # 'fi', which would not exercise the one-to-two mapping.
        ('\ufb01NNISH', 'Finnish'),
        ('A\u03a3 \u1fa1xy', 'A\u03c2 \u1fa9xy'),
        ('A\u03a3A', 'A\u03c3a'),
    )
    for text, expected in cases:
        self.assertEqual(text.title(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300825
def test_swapcase(self):
    # str.swapcase() on non-BMP characters and on characters with
    # special case mappings, run after the shared CommonTest checks.
    string_tests.CommonTest.test_swapcase(self)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U00010427', '\U0001044F'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U0001044F\U00010427'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044FX\U00010427'),
        # U+FB01 LATIN SMALL LIGATURE FI swaps to the two letters 'FI'.
        # Spelled as an escape so the literal cannot be flattened to
        # ASCII 'fi', which would make this check vacuous.
        ('\ufb01', 'FI'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0399\u03c3'),
        ('A\u0345\u03a3', 'a\u0399\u03c2'),
        ('A\u0345\u03a3a', 'a\u0399\u03c3A'),
        ('A\u03a3\u0345', 'a\u03c2\u0399'),
        ('\u03a3\u0345 ', '\u03c3\u0399 '),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
    )
    for text, expected in cases:
        self.assertEqual(text.swapcase(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300851
def test_center(self):
    # str.center() with a non-BMP fill character, run after the shared
    # CommonTest checks.
    string_tests.CommonTest.test_center(self)
    by_width = (
        (2, 'x\U0010FFFF'),
        (3, '\U0010FFFFx\U0010FFFF'),
        (4, '\U0010FFFFx\U0010FFFF\U0010FFFF'),
    )
    for width, expected in by_width:
        padded = 'x'.center(width, '\U0010FFFF')
        self.assertEqual(padded, expected)
860
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    # Issue #22643
    # Upper-casing a very long string must raise OverflowError.  Only
    # runs where sys.maxsize == 2**31 - 1 (see the decorator above).
    # NOTE(review): the size is presumably chosen so that the
    # worst-case result buffer exceeds the 32-bit limit -- confirm
    # against the issue.
    size = 2**32//12 + 1
    try:
        s = "ü" * size
    except MemoryError:
        # Cannot even build the input string: skip rather than fail.
        self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
    try:
        self.assertRaises(OverflowError, s.upper)
    finally:
        # Drop the reference to the huge string whether or not the
        # assertion passed.
        del s
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400874
Walter Dörwald28256f22003-01-19 16:59:20 +0000875 def test_contains(self):
876 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000877 self.assertIn('a', 'abdb')
878 self.assertIn('a', 'bdab')
879 self.assertIn('a', 'bdaba')
880 self.assertIn('a', 'bdba')
881 self.assertNotIn('a', 'bdb')
882 self.assertIn('a', 'bdba')
883 self.assertIn('a', ('a',1,None))
884 self.assertIn('a', (1,None,'a'))
885 self.assertIn('a', ('a',1,None))
886 self.assertIn('a', (1,None,'a'))
887 self.assertNotIn('a', ('x',1,'y'))
888 self.assertNotIn('a', ('x',1,None))
889 self.assertNotIn('abcd', 'abcxxxx')
890 self.assertIn('ab', 'abcd')
891 self.assertIn('ab', 'abc')
892 self.assertIn('ab', (1,None,'ab'))
893 self.assertIn('', 'abc')
894 self.assertIn('', '')
895 self.assertIn('', 'abc')
896 self.assertNotIn('\0', 'abc')
897 self.assertIn('\0', '\0abc')
898 self.assertIn('\0', 'abc\0')
899 self.assertIn('a', '\0abc')
900 self.assertIn('asdf', 'asdf')
901 self.assertNotIn('asdf', 'asd')
902 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000903
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000904 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200905 # test mixed kinds
906 for fill in ('a', '\u0100', '\U00010300'):
907 fill *= 9
908 for delim in ('c', '\u0102', '\U00010302'):
909 self.assertNotIn(delim, fill)
910 self.assertIn(delim, fill + delim)
911 self.assertNotIn(delim * 2, fill)
912 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000913
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300914 def test_issue18183(self):
915 '\U00010000\U00100000'.lower()
916 '\U00010000\U00100000'.casefold()
917 '\U00010000\U00100000'.upper()
918 '\U00010000\U00100000'.capitalize()
919 '\U00010000\U00100000'.title()
920 '\U00010000\U00100000'.swapcase()
921 '\U00100000'.center(3, '\U00010000')
922 '\U00100000'.ljust(3, '\U00010000')
923 '\U00100000'.rjust(3, '\U00010000')
924
Eric Smith8c663262007-08-25 02:26:07 +0000925 def test_format(self):
926 self.assertEqual(''.format(), '')
927 self.assertEqual('a'.format(), 'a')
928 self.assertEqual('ab'.format(), 'ab')
929 self.assertEqual('a{{'.format(), 'a{')
930 self.assertEqual('a}}'.format(), 'a}')
931 self.assertEqual('{{b'.format(), '{b')
932 self.assertEqual('}}b'.format(), '}b')
933 self.assertEqual('a{{b'.format(), 'a{b')
934
935 # examples from the PEP:
936 import datetime
937 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
938 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
939 "My name is Fred")
940 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
941 "My name is Fred :-{}")
942
943 d = datetime.date(2007, 8, 18)
944 self.assertEqual("The year is {0.year}".format(d),
945 "The year is 2007")
946
Eric Smith8c663262007-08-25 02:26:07 +0000947 # classes we'll use for testing
948 class C:
949 def __init__(self, x=100):
950 self._x = x
951 def __format__(self, spec):
952 return spec
953
954 class D:
955 def __init__(self, x):
956 self.x = x
957 def __format__(self, spec):
958 return str(self.x)
959
960 # class with __str__, but no __format__
961 class E:
962 def __init__(self, x):
963 self.x = x
964 def __str__(self):
965 return 'E(' + self.x + ')'
966
967 # class with __repr__, but no __format__ or __str__
968 class F:
969 def __init__(self, x):
970 self.x = x
971 def __repr__(self):
972 return 'F(' + self.x + ')'
973
974 # class with __format__ that forwards to string, for some format_spec's
975 class G:
976 def __init__(self, x):
977 self.x = x
978 def __str__(self):
979 return "string is " + self.x
980 def __format__(self, format_spec):
981 if format_spec == 'd':
982 return 'G(' + self.x + ')'
983 return object.__format__(self, format_spec)
984
Eric Smith739e2ad2007-08-27 19:07:22 +0000985 class I(datetime.date):
986 def __format__(self, format_spec):
987 return self.strftime(format_spec)
988
Eric Smith185e30c2007-08-30 22:23:08 +0000989 class J(int):
990 def __format__(self, format_spec):
991 return int.__format__(self * 2, format_spec)
992
Eric Smith8c663262007-08-25 02:26:07 +0000993
994 self.assertEqual(''.format(), '')
995 self.assertEqual('abc'.format(), 'abc')
996 self.assertEqual('{0}'.format('abc'), 'abc')
997 self.assertEqual('{0:}'.format('abc'), 'abc')
998# self.assertEqual('{ 0 }'.format('abc'), 'abc')
999 self.assertEqual('X{0}'.format('abc'), 'Xabc')
1000 self.assertEqual('{0}X'.format('abc'), 'abcX')
1001 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1002 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1003 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1004 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1005 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1006 self.assertEqual('{0}'.format(-15), '-15')
1007 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1008 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1009 self.assertEqual('{{'.format(), '{')
1010 self.assertEqual('}}'.format(), '}')
1011 self.assertEqual('{{}}'.format(), '{}')
1012 self.assertEqual('{{x}}'.format(), '{x}')
1013 self.assertEqual('{{{0}}}'.format(123), '{123}')
1014 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1015 self.assertEqual('}}{{'.format(), '}{')
1016 self.assertEqual('}}x{{'.format(), '}x{')
1017
Eric Smith7ade6482007-08-26 22:27:13 +00001018 # weird field names
1019 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1020 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001021 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001022
Eric Smith8c663262007-08-25 02:26:07 +00001023 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1024 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1025 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1026 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1027 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1028 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1029 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1030
Eric Smith8c663262007-08-25 02:26:07 +00001031 # strings
1032 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1033 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1034 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1035 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1036 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1037 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1038 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1039 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1040 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1041 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1042 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1043 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1044 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1045 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1046 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1047 self.assertEqual('{0:>7s}'.format('result'), ' result')
1048 self.assertEqual('{0:>8s}'.format('result'), ' result')
1049 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1050 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1051 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1052 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1053 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1054 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1055
Eric V. Smith2ea97122014-04-14 11:55:10 -04001056 # issue 12546: use \x00 as a fill character
1057 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1058 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1059 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1060 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1061
1062 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1063 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1064 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1065 self.assertEqual('{0:<6}'.format(3), '3 ')
1066
1067 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1068 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1069 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1070 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1071
1072 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1073 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1074 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1075 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1076
Eric Smith8c663262007-08-25 02:26:07 +00001077 # format specifiers for user defined type
1078 self.assertEqual('{0:abc}'.format(C()), 'abc')
1079
Georg Brandld52429f2008-07-04 15:55:02 +00001080 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001081 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1082 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1083 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1084 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1085 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1086 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1087 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001088 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001089 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1090 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001091 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001092 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001093 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001094 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1095 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001096 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001097
Eric Smith8c663262007-08-25 02:26:07 +00001098 # test fallback to object.__format__
1099 self.assertEqual('{0}'.format({}), '{}')
1100 self.assertEqual('{0}'.format([]), '[]')
1101 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001102
Eric Smith8c663262007-08-25 02:26:07 +00001103 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001104 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1105
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001106 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1107 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1108 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001109
Eric Smith739e2ad2007-08-27 19:07:22 +00001110 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1111 month=8,
1112 day=27)),
1113 "date: 2007-08-27")
1114
Eric Smith185e30c2007-08-30 22:23:08 +00001115 # test deriving from a builtin type and overriding __format__
1116 self.assertEqual("{0}".format(J(10)), "20")
1117
1118
Eric Smith8c663262007-08-25 02:26:07 +00001119 # string format specifiers
1120 self.assertEqual('{0:}'.format('a'), 'a')
1121
1122 # computed format specifiers
1123 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1124 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1125 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1126 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1127 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1128
1129 # test various errors
1130 self.assertRaises(ValueError, '{'.format)
1131 self.assertRaises(ValueError, '}'.format)
1132 self.assertRaises(ValueError, 'a{'.format)
1133 self.assertRaises(ValueError, 'a}'.format)
1134 self.assertRaises(ValueError, '{a'.format)
1135 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001136 self.assertRaises(IndexError, '{0}'.format)
1137 self.assertRaises(IndexError, '{1}'.format, 'abc')
1138 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001139 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001140 self.assertRaises(ValueError, "abc{0:{}".format)
1141 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001142 self.assertRaises(IndexError, "{0.}".format)
1143 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001144 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001145 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001146 self.assertRaises(KeyError, "{0]}".format)
1147 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001148 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001149 self.assertRaises(ValueError, "{0[0}".format, 0)
1150 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1151 self.assertRaises(KeyError, "{c]}".format)
1152 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1153 self.assertRaises(ValueError, "{0}}".format, 0)
1154 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001155 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001156 self.assertRaises(ValueError, "{0!}".format, 0)
1157 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001158 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001159 self.assertRaises(IndexError, "{:}".format)
1160 self.assertRaises(IndexError, "{:s}".format)
1161 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001162 big = "23098475029384702983476098230754973209482573"
1163 self.assertRaises(ValueError, ("{" + big + "}").format)
1164 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001165
Eric Smith41669ca2009-05-23 14:23:22 +00001166 # issue 6089
1167 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1168 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1169
Eric Smith8c663262007-08-25 02:26:07 +00001170 # can't have a replacement on the field name portion
1171 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1172
1173 # exceed maximum recursion depth
1174 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1175 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1176 0, 1, 2, 3, 4, 5, 6, 7)
1177
1178 # string format spec errors
1179 self.assertRaises(ValueError, "{0:-s}".format, '')
1180 self.assertRaises(ValueError, format, "", "-")
1181 self.assertRaises(ValueError, "{0:=s}".format, '')
1182
Eric Smithb1ebcc62008-07-15 13:02:41 +00001183 # Alternate formatting is not supported
1184 self.assertRaises(ValueError, format, '', '#')
1185 self.assertRaises(ValueError, format, '', '#20')
1186
Victor Stinnerece58de2012-04-23 23:36:38 +02001187 # Non-ASCII
1188 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1189 'ABC\u0410\u0411\u0412')
1190 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1191 'ABC')
1192 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1193 '')
1194
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001195 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001196 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1197 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1198 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1199 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1200 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1201 self.assertRaises(ValueError, "{a{}b}".format, 42)
1202 self.assertRaises(ValueError, "{a{b}".format, 42)
1203 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001204
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001205 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001206
Eric Smith27bbca62010-11-04 17:06:58 +00001207 def test_format_map(self):
1208 self.assertEqual(''.format_map({}), '')
1209 self.assertEqual('a'.format_map({}), 'a')
1210 self.assertEqual('ab'.format_map({}), 'ab')
1211 self.assertEqual('a{{'.format_map({}), 'a{')
1212 self.assertEqual('a}}'.format_map({}), 'a}')
1213 self.assertEqual('{{b'.format_map({}), '{b')
1214 self.assertEqual('}}b'.format_map({}), '}b')
1215 self.assertEqual('a{{b'.format_map({}), 'a{b')
1216
1217 # using mappings
1218 class Mapping(dict):
1219 def __missing__(self, key):
1220 return key
1221 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1222 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1223
1224 class InternalMapping:
1225 def __init__(self):
1226 self.mapping = {'a': 'hello'}
1227 def __getitem__(self, key):
1228 return self.mapping[key]
1229 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1230
1231
Eric Smith27bbca62010-11-04 17:06:58 +00001232 class C:
1233 def __init__(self, x=100):
1234 self._x = x
1235 def __format__(self, spec):
1236 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001237 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1238
1239 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001240 self.assertRaises(TypeError, ''.format_map)
1241 self.assertRaises(TypeError, 'a'.format_map)
1242
1243 self.assertRaises(ValueError, '{'.format_map, {})
1244 self.assertRaises(ValueError, '}'.format_map, {})
1245 self.assertRaises(ValueError, 'a{'.format_map, {})
1246 self.assertRaises(ValueError, 'a}'.format_map, {})
1247 self.assertRaises(ValueError, '{a'.format_map, {})
1248 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001249
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001250 # issue #12579: can't supply positional params to format_map
1251 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1252 self.assertRaises(ValueError, '{}'.format_map, 'a')
1253 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1254
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001255 def test_format_huge_precision(self):
1256 format_string = ".{}f".format(sys.maxsize + 1)
1257 with self.assertRaises(ValueError):
1258 result = format(2.34, format_string)
1259
1260 def test_format_huge_width(self):
1261 format_string = "{}f".format(sys.maxsize + 1)
1262 with self.assertRaises(ValueError):
1263 result = format(2.34, format_string)
1264
1265 def test_format_huge_item_number(self):
1266 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1267 with self.assertRaises(ValueError):
1268 result = format_string.format(2.34)
1269
Eric Smith8ec90442009-03-14 12:29:34 +00001270 def test_format_auto_numbering(self):
1271 class C:
1272 def __init__(self, x=100):
1273 self._x = x
1274 def __format__(self, spec):
1275 return spec
1276
1277 self.assertEqual('{}'.format(10), '10')
1278 self.assertEqual('{:5}'.format('s'), 's ')
1279 self.assertEqual('{!r}'.format('s'), "'s'")
1280 self.assertEqual('{._x}'.format(C(10)), '10')
1281 self.assertEqual('{[1]}'.format([1, 2]), '2')
1282 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1283 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1284
1285 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1286 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1287
1288 # can't mix and match numbering and auto-numbering
1289 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1290 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1291 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1292 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1293
1294 # can mix and match auto-numbering and named
1295 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1296 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1297 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1298 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1299
def test_formatting(self):
    # Run the shared checks from string_tests first, then the
    # str-specific %-formatting checks below.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads to five columns, hence the double space before the
    # four-character values.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    # %c accepts a code point or a one-character string
    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; a negative
    # width left-justifies
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    #issue 19995
    class PseudoInt:
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # defines __int__ only, no __index__
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001386
Ethan Furmanfb137212013-08-31 10:18:55 -07001387 def test_formatting_with_enum(self):
1388 # issue18780
1389 import enum
1390 class Float(float, enum.Enum):
1391 PI = 3.1415926
1392 class Int(enum.IntEnum):
1393 IDES = 15
1394 class Str(str, enum.Enum):
1395 ABC = 'abc'
1396 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001397 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1398 'Str.ABC, Str.ABC')
1399 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1400 (Str.ABC, Str.ABC,
1401 Int.IDES, Int.IDES, Int.IDES,
1402 Float.PI, Float.PI),
1403 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001404
1405 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001406 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1407 '...Str.ABC...')
1408 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1409 '...Int.IDES...')
1410 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1411 '...15...')
1412 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1413 '...15...')
1414 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1415 '...15...')
1416 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1417 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001418
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001419 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001420 format_string = "%.{}f".format(sys.maxsize + 1)
1421 with self.assertRaises(ValueError):
1422 result = format_string % 2.34
1423
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # Same check as for sys.maxsize + 1, but with a precision one past
    # INT_MAX as reported by the _testcapi module.
    from _testcapi import INT_MAX
    template = "%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, template.__mod__, 2.34)
1430
1431 def test_formatting_huge_width(self):
1432 format_string = "%{}f".format(sys.maxsize + 1)
1433 with self.assertRaises(ValueError):
1434 result = format_string % 2.34
1435
Ezio Melottiba42fd52011-04-26 06:09:45 +03001436 def test_startswith_endswith_errors(self):
1437 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001438 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001439 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001440 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001441 self.assertIn('str', exc)
1442 self.assertIn('tuple', exc)
1443
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    # NOTE(review): the decorator presumably switches LC_ALL to one of
    # the listed locales for the duration of the test -- confirm in
    # test.support.
    rendered = '%.1f' % 1.0
    self.assertEqual('1.0', rendered)
Georg Brandlda6b1072006-01-20 17:48:54 +00001448
def test_constructor(self):
    # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
    not_jython = not sys.platform.startswith('java')

    unchanged = str('unicode remains unicode')
    self.assertEqual(unchanged, 'unicode remains unicode')

    # str() of a str subclass instance compares equal to the plain text
    for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
        subclass = StrSubclass(text)
        self.assertEqual(str(subclass), text)
        self.assertEqual(len(subclass), len(text))
        if text != 'ascii':
            continue
        for codec in ('ascii', 'utf-8'):
            self.assertEqual(subclass.encode(codec), b'ascii')

    converted = str('strings are converted to unicode')
    self.assertEqual(converted, 'strings are converted to unicode')

    # any object providing __str__ can be converted
    class StringCompat:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return self.x

    recognized = StringCompat('__str__ compatible objects are recognized')
    self.assertEqual(
        str(recognized),
        '__str__ compatible objects are recognized'
    )

    # unicode(obj) is compatible to str():

    o = StringCompat('unicode(obj) is compatible to str()')
    # the conversion is checked twice, as in the original test
    for _ in range(2):
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

    for obj in (123, 123.45, 123):
        once = str(obj)
        self.assertEqual(once, str(str(obj)))

    # unicode(obj, encoding, error) tests (this maps to
    # PyUnicode_FromEncodedObject() at C level)

    if not_jython:
        # a str cannot be decoded again
        with self.assertRaises(TypeError):
            str('decoding unicode is not supported', 'utf-8', 'strict')

    decoded = str(b'strings are decoded to unicode', 'utf-8', 'strict')
    self.assertEqual(decoded, 'strings are decoded to unicode')

    if not_jython:
        view = memoryview(b'character buffers are decoded to unicode')
        self.assertEqual(
            str(view, 'utf-8', 'strict'),
            'character buffers are decoded to unicode'
        )

    # encoding and errors have to be strings
    with self.assertRaises(TypeError):
        str(42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001518
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001519 def test_constructor_keyword_args(self):
1520 """Pass various keyword argument combinations to the constructor."""
1521 # The object argument can be passed as a keyword.
1522 self.assertEqual(str(object='foo'), 'foo')
1523 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1524 # The errors argument without encoding triggers "decode" mode.
1525 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1526 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1527
1528 def test_constructor_defaults(self):
1529 """Check the constructor argument defaults."""
1530 # The object argument defaults to '' or b''.
1531 self.assertEqual(str(), '')
1532 self.assertEqual(str(errors='strict'), '')
1533 utf8_cent = '¢'.encode('utf-8')
1534 # The encoding argument defaults to utf-8.
1535 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1536 # The errors argument defaults to strict.
1537 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1538
Walter Dörwald28256f22003-01-19 16:59:20 +00001539 def test_codecs_utf7(self):
1540 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001541 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1542 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1543 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1544 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1545 ('+', b'+-'),
1546 ('+-', b'+--'),
1547 ('+?', b'+-?'),
1548 ('\?', b'+AFw?'),
1549 ('+?', b'+-?'),
1550 (r'\\?', b'+AFwAXA?'),
1551 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001552 (r'++--', b'+-+---'),
1553 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1554 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001555 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001556
Walter Dörwald28256f22003-01-19 16:59:20 +00001557 for (x, y) in utfTests:
1558 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001559
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001560 # Unpaired surrogates are passed through
1561 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1562 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1563 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1564 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1565 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1566 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1567 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1568 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001569
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001570 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1571 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001572
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001573 # Issue #2242: crash on some Windows/MSVC versions
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03001574 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
Antoine Pitrou244651a2009-05-04 18:56:13 +00001575
1576 # Direct encoded characters
1577 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1578 # Optional direct characters
1579 set_o = '!"#$%&*;<=>@[]^_`{|}'
1580 for c in set_d:
1581 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1582 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1583 for c in set_o:
1584 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001585
Walter Dörwald28256f22003-01-19 16:59:20 +00001586 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001587 self.assertEqual(''.encode('utf-8'), b'')
1588 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001589 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1590 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001591 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1592 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001593 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1594 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001595 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001596 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1597 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1598 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1599 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1600 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1601 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001602 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1603 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1604 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1605 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1606 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1607 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1608 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1609 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1610 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1611 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001612 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001613
Walter Dörwald28256f22003-01-19 16:59:20 +00001614 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001615 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1616 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1617 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618
Walter Dörwald28256f22003-01-19 16:59:20 +00001619 # Other possible utf-8 test cases:
1620 # * strict decoding testing for all of the
1621 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622
Ezio Melotti57221d02010-07-01 07:32:02 +00001623 def test_utf8_decode_valid_sequences(self):
1624 sequences = [
1625 # single byte
1626 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1627 # 2 bytes
1628 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1629 # 3 bytes
1630 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1631 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1632 # 4 bytes
1633 (b'\xF0\x90\x80\x80', '\U00010000'),
1634 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1635 ]
1636 for seq, res in sequences:
1637 self.assertEqual(seq.decode('utf-8'), res)
1638
1639
1640 def test_utf8_decode_invalid_sequences(self):
1641 # continuation bytes in a sequence of 2, 3, or 4 bytes
1642 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001643 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001644 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001645 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001646 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1647 invalid_start_bytes = (
1648 continuation_bytes + invalid_2B_seq_start_bytes +
1649 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1650 )
1651
1652 for byte in invalid_start_bytes:
1653 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1654
1655 for sb in invalid_2B_seq_start_bytes:
1656 for cb in continuation_bytes:
1657 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1658
1659 for sb in invalid_4B_seq_start_bytes:
1660 for cb1 in continuation_bytes[:3]:
1661 for cb3 in continuation_bytes[:3]:
1662 self.assertRaises(UnicodeDecodeError,
1663 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1664
1665 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1666 self.assertRaises(UnicodeDecodeError,
1667 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1668 self.assertRaises(UnicodeDecodeError,
1669 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1670 # surrogates
1671 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1672 self.assertRaises(UnicodeDecodeError,
1673 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1674 self.assertRaises(UnicodeDecodeError,
1675 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1676 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1677 self.assertRaises(UnicodeDecodeError,
1678 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1679 self.assertRaises(UnicodeDecodeError,
1680 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1681 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1682 self.assertRaises(UnicodeDecodeError,
1683 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1684 self.assertRaises(UnicodeDecodeError,
1685 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1686
1687 def test_issue8271(self):
1688 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1689 # only the start byte and the continuation byte(s) are now considered
1690 # invalid, instead of the number of bytes specified by the start byte.
1691 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1692 # table 3-8, Row 2) for more information about the algorithm used.
1693 FFFD = '\ufffd'
1694 sequences = [
1695 # invalid start bytes
1696 (b'\x80', FFFD), # continuation byte
1697 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1698 (b'\xc0', FFFD),
1699 (b'\xc0\xc0', FFFD*2),
1700 (b'\xc1', FFFD),
1701 (b'\xc1\xc0', FFFD*2),
1702 (b'\xc0\xc1', FFFD*2),
1703 # with start byte of a 2-byte sequence
1704 (b'\xc2', FFFD), # only the start byte
1705 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001706 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001707 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1708 # with start byte of a 3-byte sequence
1709 (b'\xe1', FFFD), # only the start byte
1710 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1711 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1712 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1713 (b'\xe1\x80', FFFD), # only 1 continuation byte
1714 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1715 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1716 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1717 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1718 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1719 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1720 # with start byte of a 4-byte sequence
1721 (b'\xf1', FFFD), # only the start byte
1722 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1723 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1724 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1725 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1726 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1727 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1728 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1729 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1730 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1731 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1732 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1733 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1734 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1735 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1736 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1737 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1738 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1739 # with invalid start byte of a 4-byte sequence (rfc2279)
1740 (b'\xf5', FFFD), # only the start byte
1741 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1742 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1743 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1744 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1745 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1746 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1747 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1748 # with invalid start byte of a 5-byte sequence (rfc2279)
1749 (b'\xf8', FFFD), # only the start byte
1750 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1751 (b'\xf8\x80', FFFD*2), # only one continuation byte
1752 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1753 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1754 # with invalid start byte of a 6-byte sequence (rfc2279)
1755 (b'\xfc', FFFD), # only the start byte
1756 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1757 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1758 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1759 # invalid start byte
1760 (b'\xfe', FFFD),
1761 (b'\xfe\x80\x80', FFFD*3),
1762 # other sequences
1763 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1764 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1765 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1766 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1767 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1768 ]
1769 for n, (seq, res) in enumerate(sequences):
1770 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1771 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1772 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1773 self.assertEqual(seq.decode('utf-8', 'ignore'),
1774 res.replace('\uFFFD', ''))
1775
def to_bytestring(self, seq):
    # Turn a string of whitespace-separated hexadecimal byte values
    # (e.g. 'C2 41') into the corresponding bytes object.
    hex_values = seq.split()
    return bytes([int(value, 16) for value in hex_values])
1778
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check how the standard error handlers treat an invalid UTF-8 sequence.

    Decoding seq with 'strict' must raise a UnicodeDecodeError whose
    message contains err; decoding with 'replace' must give res; and
    decoding with 'ignore' must give res with every U+FFFD removed.
    The 'replace' and 'ignore' checks are also run with seq embedded
    between the ASCII strings 'aaaa' and 'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    def check(handler, expected):
        # The sequence alone, then surrounded by valid ASCII text.
        self.assertEqual(seq.decode('utf-8', handler), expected)
        padded = b'aaaa' + seq + b'bbbb'
        self.assertEqual(padded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')

    check('replace', res)
    check('ignore', res.replace('\ufffd', ''))
1797
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first byte
    is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
    4-bytes sequence. The invalid start byte is replaced with a single
    U+FFFD when errors='replace'.
    E.g. <80> is a continuation byte and can appear only after a start byte.
    """
    FFFD = '\ufffd'
    # Iterating over a bytes object yields ints, hence the bytes([byte])
    # to get back a one-byte sequence.
    for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
        self.assertCorrectUTF8Decoding(bytes([byte]), FFFD,
                                       'invalid start byte')
1810
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the string
    ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
    enough continuation bytes.  The incomplete sequence is replaced with a
    single U+FFFD when errors='replace'.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it's followed by only 2 valid continuation bytes and the
    last continuation byte is missing.
    Note: the continuation bytes must be all valid, if one of them is
    invalid another error will be raised.
    """
    sequences = [
        'C2', 'DF',
        'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
        'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
        'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
    ]
    FFFD = '\ufffd'
    for seq in sequences:
        # Every truncated sequence collapses to one replacement character.
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), FFFD,
                                       'unexpected end of data')
1836
def test_invalid_cb_for_2bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    byte following the start byte of a 2-bytes sequence is not a valid
    continuation byte.  With errors='replace' the start byte becomes a
    single U+FFFD and the second byte is handled on its own.
    E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
    sequence, but 41 is not a valid continuation byte because it's the
    ASCII letter 'A'.
    """
    FFFD = '\ufffd'
    # (offending second byte, expected result with errors='replace'):
    # the ASCII bytes decode to themselves, C0 and FF are replaced by a
    # second U+FFFD.
    second_bytes = (
        ('00', FFFD + '\x00'),
        ('7F', FFFD + '\x7f'),
        ('C0', FFFD * 2),
        ('FF', FFFD * 2),
    )
    for start in ('C2', 'DF'):
        for second, expected in second_bytes:
            raw = self.to_bytestring(start + ' ' + second)
            self.assertCorrectUTF8Decoding(raw, expected,
                                           'invalid continuation byte')
1858
def test_invalid_cb_for_3bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 3-bytes sequence are invalid.  When
    errors='replace', if the first continuation byte is valid, the first
    two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
    third byte is handled separately, otherwise only the start byte is
    replaced with a U+FFFD and the following bytes are handled separately.
    E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is E0 or ED, the valid ranges for the first
    continuation byte are limited to A0..BF and 80..9F respectively.
    Python 2 used to consider all the bytes in range 80..BF valid when the
    start byte was ED.  This is fixed in Python 3.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Offending bytes and the expected 'replace' result once the bytes
    # before them have been turned into one U+FFFD.
    ascii_tails = (('00', FFFD + '\x00'), ('7F', FFFD + '\x7f'))
    high_tails = (('C0', FFFDx2), ('FF', FFFDx2))
    # (start byte,
    #  first cbs in 80..BF that are out of range for this start byte,
    #  lowest and highest valid first cb)
    start_bytes = (
        ('E0', ('80', '9F'), ('A0', 'BF')),
        ('E1', (), ('80', 'BF')),
        ('EC', (), ('80', 'BF')),
        ('ED', ('A0', 'BF'), ('80', '9F')),  # see note in the docstring
        ('EE', (), ('80', 'BF')),
        ('EF', (), ('80', 'BF')),
    )
    sequences = []
    for start, out_of_range, valid_cbs in start_bytes:
        # Start byte immediately followed by an invalid byte.
        for tail, res in ascii_tails:
            sequences.append((start + ' ' + tail, res))
        for cb in out_of_range:
            sequences.append((start + ' ' + cb, FFFDx2))
        for tail, res in high_tails:
            sequences.append((start + ' ' + tail, res))
        # Start byte, one valid continuation byte, then an invalid byte.
        for cb in valid_cbs:
            for tail, res in ascii_tails + high_tails:
                sequences.append((start + ' ' + cb + ' ' + tail, res))

    for seq, res in sequences:
        raw = self.to_bytestring(seq)
        self.assertCorrectUTF8Decoding(raw, res,
                                       'invalid continuation byte')
1916
def test_invalid_cb_for_4bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 4-bytes sequence are invalid.  When
    errors='replace', the start byte and all the following valid
    continuation bytes are replaced with a single U+FFFD, and all the bytes
    starting from the first invalid continuation byte (included) are
    handled separately.
    E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is F0 or F4, the valid ranges for the first
    continuation byte are limited to 90..BF and 80..8F respectively.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Offending bytes and the expected 'replace' result once the bytes
    # before them have been turned into one U+FFFD.
    ascii_tails = (('00', FFFD + '\x00'), ('7F', FFFD + '\x7f'))
    high_tails = (('C0', FFFDx2), ('FF', FFFDx2))
    all_tails = ascii_tails + high_tails
    # (start byte,
    #  first cbs in 80..BF that are out of range for this start byte,
    #  lowest and highest valid first cb)
    start_bytes = (
        ('F0', ('80', '8F'), ('90', 'BF')),
        ('F1', (), ('80', 'BF')),
        ('F3', (), ('80', 'BF')),
        ('F4', ('90', 'BF'), ('80', '8F')),
    )
    sequences = []
    for start, out_of_range, valid_first in start_bytes:
        # Start byte immediately followed by an invalid byte.
        for tail, res in ascii_tails:
            sequences.append((start + ' ' + tail, res))
        for cb in out_of_range:
            sequences.append((start + ' ' + cb, FFFDx2))
        for tail, res in high_tails:
            sequences.append((start + ' ' + tail, res))
        # Start byte, one valid continuation byte, then an invalid byte.
        for first in valid_first:
            for tail, res in all_tails:
                sequences.append((start + ' ' + first + ' ' + tail, res))
        # Start byte, two valid continuation bytes, then an invalid byte.
        for first in valid_first:
            for second in ('80', 'BF'):
                prefix = start + ' ' + first + ' ' + second
                for tail, res in all_tails:
                    sequences.append((prefix + ' ' + tail, res))

    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1995
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001996 def test_codecs_idna(self):
1997 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001998 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001999
def test_codecs_errors(self):
    text = 'Andr\202 x'
    data = b'Andr\202 x'

    # Error handling (encoding)
    with self.assertRaises(UnicodeError):
        text.encode('ascii')
    with self.assertRaises(UnicodeError):
        text.encode('ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Positional and keyword arguments must give the same result.
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    with self.assertRaises(UnicodeError):
        str(data, 'ascii')
    with self.assertRaises(UnicodeError):
        str(data, 'ascii', 'strict')
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')
    self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')

    # Error handling (unknown character names)
    decoded = b"\\N{foo}xx".decode("unicode-escape", "ignore")
    self.assertEqual(decoded, "xx")

    # Error handling (truncated escape sequence)
    with self.assertRaises(UnicodeError):
        b"\\".decode("unicode-escape")

    # Error handling (codecs returning something else than a tuple with
    # a str/bytes result); "test.unicode1" and "test.unicode2" are
    # expected to be provided by a registered codec search function.
    with self.assertRaises(TypeError):
        b"hello".decode("test.unicode1")
    with self.assertRaises(TypeError):
        str(b"hello", "test.unicode2")
    for codec_name in ("test.unicode1", "test.unicode2"):
        with self.assertRaises(TypeError):
            "hello".encode(codec_name)

    # Error handling (wrong arguments)
    with self.assertRaises(TypeError):
        "hello".encode(42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            with self.assertRaises(UnicodeError):
                convert(lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00002037
Walter Dörwald28256f22003-01-19 16:59:20 +00002038 def test_codecs(self):
2039 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002040 self.assertEqual('hello'.encode('ascii'), b'hello')
2041 self.assertEqual('hello'.encode('utf-7'), b'hello')
2042 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002043 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002044 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2045 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2046 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002047
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002048 # Default encoding is utf-8
2049 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2050
Walter Dörwald28256f22003-01-19 16:59:20 +00002051 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002052 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002053 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002054 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2055 'utf-16-be', 'raw_unicode_escape',
2056 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002057 with warnings.catch_warnings():
2058 # unicode-internal has been deprecated
2059 warnings.simplefilter("ignore", DeprecationWarning)
2060
2061 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002062
Walter Dörwald28256f22003-01-19 16:59:20 +00002063 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002064 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002065 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002066 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002067 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002068
Walter Dörwald28256f22003-01-19 16:59:20 +00002069 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002070 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002071 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002072 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002073 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002074
Walter Dörwald28256f22003-01-19 16:59:20 +00002075 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002076 with warnings.catch_warnings():
2077 # unicode-internal has been deprecated
2078 warnings.simplefilter("ignore", DeprecationWarning)
2079
2080 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2081 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2082 'raw_unicode_escape',
2083 'unicode_escape', 'unicode_internal'):
2084 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002085
Antoine Pitrou51f66482011-11-11 13:35:44 +01002086 # UTF-8 must be roundtrip safe for all code points
2087 # (except surrogates, which are forbidden).
2088 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002089 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002090 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002091 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002092
Walter Dörwald28256f22003-01-19 16:59:20 +00002093 def test_codecs_charmap(self):
2094 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002095 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002096 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002097 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002098 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2099 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002100 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002101 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2102 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002103 'iso8859_7', 'iso8859_9',
2104 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002105 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002106
Walter Dörwald28256f22003-01-19 16:59:20 +00002107 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2108 'cp1256', 'cp1257', 'cp1258',
2109 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002110
Walter Dörwald28256f22003-01-19 16:59:20 +00002111 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2112 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002113
Walter Dörwald28256f22003-01-19 16:59:20 +00002114 ### These have undefined mappings:
2115 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002116
Walter Dörwald28256f22003-01-19 16:59:20 +00002117 ### These fail the round-trip:
2118 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002119
Walter Dörwald28256f22003-01-19 16:59:20 +00002120 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002121 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002122
Walter Dörwald28256f22003-01-19 16:59:20 +00002123 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002124 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002125 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002126 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002127 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2128 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002129 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002130 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2131 'iso8859_2', 'iso8859_4', 'iso8859_5',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002132 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
Walter Dörwald28256f22003-01-19 16:59:20 +00002133 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002134
Walter Dörwald28256f22003-01-19 16:59:20 +00002135 ### These have undefined mappings:
2136 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2137 #'cp1256', 'cp1257', 'cp1258',
2138 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03002139 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
Walter Dörwald28256f22003-01-19 16:59:20 +00002140 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002141
Walter Dörwald28256f22003-01-19 16:59:20 +00002142 ### These fail the round-trip:
2143 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002144
Walter Dörwald28256f22003-01-19 16:59:20 +00002145 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002146 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002147
Walter Dörwald28256f22003-01-19 16:59:20 +00002148 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002149 self.assertEqual(("abc" "def"), "abcdef")
2150 self.assertEqual(("abc" "def"), "abcdef")
2151 self.assertEqual(("abc" "def"), "abcdef")
2152 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2153 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002154
Walter Dörwald28256f22003-01-19 16:59:20 +00002155 def test_printing(self):
2156 class BitBucket:
2157 def write(self, text):
2158 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002159
Walter Dörwald28256f22003-01-19 16:59:20 +00002160 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002161 print('abc', file=out)
2162 print('abc', 'def', file=out)
2163 print('abc', 'def', file=out)
2164 print('abc', 'def', file=out)
2165 print('abc\n', file=out)
2166 print('abc\n', end=' ', file=out)
2167 print('abc\n', end=' ', file=out)
2168 print('def\n', file=out)
2169 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002172 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002173 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2174 self.assertEqual(x, y)
2175
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002176 y = br'\U00100000'
2177 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2178 self.assertEqual(x, y)
2179 y = br'\U00010000'
2180 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2181 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002182
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002183 try:
2184 br'\U11111111'.decode("raw-unicode-escape")
2185 except UnicodeDecodeError as e:
2186 self.assertEqual(e.start, 0)
2187 self.assertEqual(e.end, 10)
2188 else:
2189 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002190
def test_conversion(self):
    # Make sure __str__() works properly

    class PlainObject:
        # Ordinary object whose __str__() returns a str.
        def __str__(self):
            return "foo"

    class FixedStr(str):
        # str subclass whose __str__() ignores its own content.
        def __str__(self):
            return "foo"

    class DoublingStr(str):
        # str subclass that stores its content twice and whose __str__()
        # returns the instance itself (i.e. a str subclass instance).
        def __new__(cls, content=""):
            return str.__new__(cls, 2*content)

        def __str__(self):
            return self

    self.assertEqual(str(PlainObject()), "foo")
    self.assertEqual(str(FixedStr("bar")), "foo")

    # str() hands back the object returned by __str__() unchanged ...
    converted = str(DoublingStr("foo"))
    self.assertEqual(converted, "foofoo")
    self.assertIs(type(converted), DoublingStr)

    # ... whereas another str subclass builds an instance of its own type.
    converted = StrSubclass(DoublingStr("foo"))
    self.assertEqual(converted, "foofoo")
    self.assertIs(type(converted), StrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002215
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002216 def test_unicode_repr(self):
2217 class s1:
2218 def __repr__(self):
2219 return '\\n'
2220
2221 class s2:
2222 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002223 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002224
2225 self.assertEqual(repr(s1()), '\\n')
2226 self.assertEqual(repr(s2()), '\\n')
2227
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002228 def test_printable_repr(self):
2229 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002230 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002231
Zachary Ware9fe6d862013-12-08 00:20:35 -06002232 # This test only affects 32-bit platforms because expandtabs can only take
2233 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2234 # to take a 64-bit long, this test should apply to all platforms.
2235 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2236 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002237 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002238 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002239
@support.cpython_only
def test_expandtabs_optimization(self):
    # A string without any tab must be returned as the very same object.
    original = 'abc'
    expanded = original.expandtabs()
    self.assertIs(expanded, original)
2244
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002245 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 if struct.calcsize('P') == 8:
2247 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002248 ascii_struct_size = 48
2249 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 else:
2251 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002252 ascii_struct_size = 24
2253 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254
2255 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2256 code = ord(char)
2257 if code < 0x100:
2258 char_size = 1 # sizeof(Py_UCS1)
2259 struct_size = ascii_struct_size
2260 elif code < 0x10000:
2261 char_size = 2 # sizeof(Py_UCS2)
2262 struct_size = compact_struct_size
2263 else:
2264 char_size = 4 # sizeof(Py_UCS4)
2265 struct_size = compact_struct_size
2266 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002267 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2268 # be allocatable, given enough memory.
2269 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 alloc = lambda: char * maxlen
2271 self.assertRaises(MemoryError, alloc)
2272 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002273
Victor Stinner808fc0a2010-03-22 12:50:40 +00002274 def test_format_subclass(self):
2275 class S(str):
2276 def __str__(self):
2277 return '__str__ overridden'
2278 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002279 self.assertEqual("%s" % s, '__str__ overridden')
2280 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002281
# Test PyUnicode_FromFormat()
def test_from_format(self):
    """Exercise the C-level PyUnicode_FromFormat() through ctypes.

    Each check calls the C function with a bytes format string plus ctypes
    (or str, wrapped in py_object) arguments and compares the resulting
    str with the expected text.  Expected values that involve a field
    width are spelled with str.rjust() so the amount of padding is
    explicit and cannot be silently altered by whitespace mangling.
    """
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments must be passed as PyObject*; everything else is
        # already a ctypes value (or bytes / None) and is passed through.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error.  The pattern is a raw string so that "\(" is a regex
    # escape rather than an invalid string escape sequence.
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd',
                 b'%c', c_int(0xabcd))
    check_format('\U0010ffff',
                 b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%',
                 b'%')
    check_format('%',
                 b'%%')
    check_format('%s',
                 b'%%s')
    check_format('[%]',
                 b'[%%]')
    check_format('%abc',
                 b'%%%s', b'abc')

    # truncated string
    check_format('abc',
                 b'%.3s', b'abcdef')
    check_format('abc[\ufffd',
                 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'",
                 b'%A', '\u20acABC')
    check_format("'\\u20",
                 b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'",
                 b'%R', '\u20acABC')
    check_format("'\u20acA",
                 b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # the following tests come from #7330
    # test width modifier and precision modifier with %S
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5S', 'abc')
    check_format("repr=ab",
                 b'repr=%.2S', 'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format("repr=" + "'abc'".rjust(8),
                 b'repr=%8R', 'abc')
    check_format("repr='ab",
                 b'repr=%.3R', 'abc')
    check_format("repr=" + "'ab".rjust(5),
                 b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format("repr=" + "'abc'".rjust(8),
                 b'repr=%8A', 'abc')
    check_format("repr='ab",
                 b'repr=%.3A', 'abc')
    check_format("repr=" + "'ab".rjust(5),
                 b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5s', b'abc')
    check_format("repr=ab",
                 b'repr=%.2s', b'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5U', 'abc')
    check_format("repr=ab",
                 b'repr=%.2U', 'abc')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format("repr=" + "abc".rjust(5),
                 b'repr=%5V', 'abc', b'123')
    check_format("repr=ab",
                 b'repr=%.2V', 'abc', b'123')
    check_format("repr=" + "ab".rjust(5),
                 b'repr=%5.2V', 'abc', b'123')
    check_format("repr=" + "123".rjust(5),
                 b'repr=%5V', None, b'123')
    check_format("repr=12",
                 b'repr=%.2V', None, b'123')
    check_format("repr=" + "12".rjust(5),
                 b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010',
                 b'%03i', c_int(10))
    check_format('0010',
                 b'%0.4i', c_int(10))
    check_format('-123',
                 b'%i', c_int(-123))
    check_format('-123',
                 b'%li', c_long(-123))
    check_format('-123',
                 b'%lli', c_longlong(-123))
    check_format('-123',
                 b'%zi', c_ssize_t(-123))

    check_format('-123',
                 b'%d', c_int(-123))
    check_format('-123',
                 b'%ld', c_long(-123))
    check_format('-123',
                 b'%lld', c_longlong(-123))
    check_format('-123',
                 b'%zd', c_ssize_t(-123))

    check_format('123',
                 b'%u', c_uint(123))
    check_format('123',
                 b'%lu', c_ulong(123))
    check_format('123',
                 b'%llu', c_ulonglong(123))
    check_format('123',
                 b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong),
                 b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong),
                 b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong),
                 b'%llu', c_ulonglong(max_ulonglong))
    # The result is not compared with anything: the call only has to
    # complete without raising.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'),
                 b'%010i', c_int(123))
    check_format('123'.rjust(100),
                 b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'),
                 b'%010u', c_uint(123))
    check_format('123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'),
                 b'%010x', c_int(0x123))
    check_format('123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'),
                 b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc',
                 b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s',
                 b'%1%s', b'abc')
    check_format('%1abc',
                 b'%1abc')
    check_format('%+i',
                 b'%+i', c_int(10))
    check_format('%.%s',
                 b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002514
# Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Check the buffer and size returned by _testcapi.unicode_aswidechar().

    Covers truncation, an exactly-fitting buffer, buffers with room for
    the trailing NUL, an embedded NUL, and a non-BMP character whose
    wchar_t length depends on sizeof(wchar_t).
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, buffer length, expected size, expected buffer content)
    scenarios = [
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    ]

    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        # 2-byte wchar_t: the character takes two units
        scenarios.append((nonbmp, 3, 2, nonbmp + '\0'))
    else:  # sizeof(c_wchar) == 4
        scenarios.append((nonbmp, 2, 1, nonbmp + '\0'))

    for text, buflen, expected_size, expected_wchar in scenarios:
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)
Victor Stinner5593d8a2010-10-02 11:11:27 +00002552
# Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    """Check the string and size returned by unicode_aswidecharstring().

    In every case the returned wide string is the input followed by a
    NUL, and the size excludes that NUL.
    """
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    nonbmp = chr(0x10ffff)
    # a non-BMP character needs two units when wchar_t is 2 bytes wide
    nonbmp_units = 2 if sizeof(c_wchar) == 2 else 1

    # (text, expected size)
    scenarios = (
        ('abc', 3),
        ('abc\0def', 7),
        (nonbmp, nonbmp_units),
    )
    for text, expected_size in scenarios:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, text + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002576
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002577 def test_subclass_add(self):
2578 class S(str):
2579 def __add__(self, o):
2580 return "3"
2581 self.assertEqual(S("4") + S("5"), "3")
2582 class S(str):
2583 def __iadd__(self, o):
2584 return "3"
2585 s = S("1")
2586 s += "4"
2587 self.assertEqual(s, "3")
2588
@support.cpython_only
def test_encode_decimal(self):
    """Check _testcapi.unicode_encodedecimal() results and error cases."""
    from _testcapi import unicode_encodedecimal

    # (input text, expected encoded bytes)
    conversions = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, encoded in conversions:
        self.assertEqual(unicode_encodedecimal(text), encoded)

    # a character that cannot be encoded is rejected in "strict" mode ...
    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")
    # ... and also when another error handler is requested
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002604
@support.cpython_only
def test_transform_decimal(self):
    """Check _testcapi.unicode_transformdecimaltoascii() on sample inputs."""
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    # (input text, expected transformed text)
    conversions = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, transformed in conversions:
        self.assertEqual(transform_decimal(text), transformed)
2616
Victor Stinnerc814a382011-11-22 01:06:15 +01002617 def test_getnewargs(self):
2618 text = 'abc'
2619 args = text.__getnewargs__()
2620 self.assertIsNot(args[0], text)
2621 self.assertEqual(args[0], text)
2622 self.assertEqual(len(args), 1)
2623
def test_resize(self):
    # Encode a string with 'unicode_internal', grow it with "+=", and check
    # that encoding the grown string gives a different result that still
    # round-trips back to the new text.
    # NOTE(review): the nesting below was reconstructed from a view that
    # lost indentation; everything after the "with" line is assumed to be
    # inside it -- confirm against the original file.
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        # the codec is expected to emit a DeprecationWarning with this text
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # fill wstr internal field
            abc = text.encode('unicode_internal')
            self.assertEqual(abc.decode('unicode_internal'), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            abcdef = text.encode('unicode_internal')
            self.assertNotEqual(abc, abcdef)
            self.assertEqual(abcdef.decode('unicode_internal'), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002640
def test_compare(self):
    """Compare strings built from ASCII, Latin-1, BMP and astral characters.

    First every pair of sample strings -- including each string paired
    with itself -- is checked for ==/!= consistency.  For identical
    objects, fresh copies are compared too so that the comparison cannot
    be answered by an identity shortcut.  Then the ordering between the
    different kinds of strings is checked explicitly.
    """
    # Issue #17615
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also yields (s, s) pairs; plain
    # combinations() never does, which would leave the "equal" branch
    # below unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # compare the two distinct copies, not copy2 with itself
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2714
@support.cpython_only
def test_pep393_utf8_caching_bug(self):
    """Growing a string with += must not leave a stale UTF-8 result behind."""
    # Issue #25709: Problem with string concatenation and utf-8 cache
    from _testcapi import getargs_s_hash

    for codepoint in (0x24, 0xa4, 0x20ac, 0x1f40d):
        char = chr(codepoint)
        encoded_char = char.encode()
        s = ''
        for repeat in range(1, 6):
            # Due to CPython specific optimization the 's' string can be
            # resized in-place.
            s += char
            expected = encoded_char * repeat
            # Parsing with the "s#" format code calls indirectly
            # PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            self.assertEqual(getargs_s_hash(s), expected)
            # Check that the second call returns the same result
            self.assertEqual(getargs_s_hash(s), expected)
2731
Victor Stinner1c24bd02010-10-02 11:03:13 +00002732
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions exposed by the _string module."""

    def test_formatter_parser(self):
        """formatter_parser() yields one 4-tuple per literal/field chunk."""
        # (format string, expected list of parsed tuples)
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, parsed in expectations:
            self.assertEqual(list(_string.formatter_parser(template)), parsed)

        # only str input is accepted
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """formatter_field_name_split() separates the head from accessors."""
        def split(name):
            # materialize the accessor iterator so it can be compared
            head, accessors = _string.formatter_field_name_split(name)
            return [head, list(accessors)]

        # (field name, expected [head, [(is_attribute, key), ...]])
        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, pieces in expectations:
            self.assertEqual(split(field_name), pieces)

        # only str input is accepted
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2782
2783
# Run the tests with unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()