blob: 4e4f7a91eea507dc09bf02039c7f565a9aab2952 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    ``test.unicode1`` has an encoder and decoder that return a bare int
    instead of a tuple; ``test.unicode2`` has an encoder and decoder that
    return a tuple whose first item is not text.  Any other encoding
    name yields None.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    broken_codecs = {
        "test.unicode1": (encode1, decode1, None, None),
        "test.unicode2": (encode2, decode2, None, None),
    }
    # dict.get() returns None for every name that is not listed above
    return broken_codecs.get(encoding)


codecs.register(search_function)
35
def duplicate_string(text):
    """
    Return a copy of *text* built by an encode/decode round trip.

    The goal is a new object with a reference count of 1.  This is
    best-effort only: latin1 single letters and the empty string ('')
    are singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
45
Brett Cannon226b2302010-03-20 22:22:22 +000046class UnicodeTest(string_tests.CommonTest,
47 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020048 string_tests.MixinStrUnicodeTest,
49 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000050
Guido van Rossumef87d6e2007-05-02 19:09:54 +000051 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000052
def checkequalnofix(self, result, object, methodname, *args):
    """Call ``object.methodname(*args)`` and check it against *result*.

    The returned value must compare equal to *result* and have exactly
    the same type.  If the method returned *object* itself, the call is
    repeated on an instance of a str subclass, where handing back the
    receiver unchanged is not acceptable.

    Raises AssertionError (through the TestCase assertion methods) when
    any of these checks fails.
    """
    realresult = getattr(object, methodname)(*args)
    self.assertEqual(realresult, result)
    # assertIs/assertIsNot name both operands in the failure message,
    # which assertTrue(a is b) cannot do.
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(str):
            def __repr__(self):
                return 'usub(%r)' % str.__repr__(self)
        object = usub(object)
        realresult = getattr(object, methodname)(*args)
        self.assertEqual(realresult, result)
        self.assertIsNot(object, realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000070
Jeremy Hylton504de6b2003-10-06 05:08:26 +000071 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000072 self.assertEqual('\xff', '\u00ff')
73 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000074 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
75 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
76 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000077 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000078 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000079
Georg Brandl559e5d72008-06-11 18:37:52 +000080 def test_ascii(self):
81 if not sys.platform.startswith('java'):
82 # Test basic sanity of repr()
83 self.assertEqual(ascii('abc'), "'abc'")
84 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
85 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
86 self.assertEqual(ascii('\\c'), "'\\\\c'")
87 self.assertEqual(ascii('\\'), "'\\\\'")
88 self.assertEqual(ascii('\n'), "'\\n'")
89 self.assertEqual(ascii('\r'), "'\\r'")
90 self.assertEqual(ascii('\t'), "'\\t'")
91 self.assertEqual(ascii('\b'), "'\\x08'")
92 self.assertEqual(ascii("'\""), """'\\'"'""")
93 self.assertEqual(ascii("'\""), """'\\'"'""")
94 self.assertEqual(ascii("'"), '''"'"''')
95 self.assertEqual(ascii('"'), """'"'""")
96 latin1repr = (
97 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
98 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
99 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
100 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
101 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
102 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
103 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
104 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
105 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
106 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
107 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
108 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
109 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
110 "\\xfe\\xff'")
111 testrepr = ascii(''.join(map(chr, range(256))))
112 self.assertEqual(testrepr, latin1repr)
113 # Test ascii works on wide unicode escapes without overflow.
114 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
115 ascii("\U00010000" * 39 + "\uffff" * 4096))
116
117 class WrongRepr:
118 def __repr__(self):
119 return b'byte-repr'
120 self.assertRaises(TypeError, ascii, WrongRepr())
121
Walter Dörwald28256f22003-01-19 16:59:20 +0000122 def test_repr(self):
123 if not sys.platform.startswith('java'):
124 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000125 self.assertEqual(repr('abc'), "'abc'")
126 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
127 self.assertEqual(repr('ab\\'), "'ab\\\\'")
128 self.assertEqual(repr('\\c'), "'\\\\c'")
129 self.assertEqual(repr('\\'), "'\\\\'")
130 self.assertEqual(repr('\n'), "'\\n'")
131 self.assertEqual(repr('\r'), "'\\r'")
132 self.assertEqual(repr('\t'), "'\\t'")
133 self.assertEqual(repr('\b'), "'\\x08'")
134 self.assertEqual(repr("'\""), """'\\'"'""")
135 self.assertEqual(repr("'\""), """'\\'"'""")
136 self.assertEqual(repr("'"), '''"'"''')
137 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000138 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000139 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000140 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
141 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
142 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
143 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
144 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000145 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
146 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
147 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
148 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
149 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
150 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
151 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
152 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000153 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000154 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000155 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000156 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
157 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000158
Georg Brandl559e5d72008-06-11 18:37:52 +0000159 class WrongRepr:
160 def __repr__(self):
161 return b'byte-repr'
162 self.assertRaises(TypeError, repr, WrongRepr())
163
Guido van Rossum49d6b072006-08-17 21:11:47 +0000164 def test_iterators(self):
165 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000166 it = "\u1111\u2222\u3333".__iter__()
167 self.assertEqual(next(it), "\u1111")
168 self.assertEqual(next(it), "\u2222")
169 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000170 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000171
def test_count(self):
    # str-specific additions to the shared count() tests.
    string_tests.CommonTest.test_count(self)
    # checkequalnofix also requires the exact result type; each case is
    # listed once.
    self.checkequalnofix(3, 'aaa', 'count', 'a')
    self.checkequalnofix(0, 'aaa', 'count', 'b')
    self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
    self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
    self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
    self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
    # test mixed kinds
    mixed_kinds = [
        # (expected, text, substring)
        (10, '\u0102' + 'a' * 10, 'a'),
        (10, '\U00100304' + 'a' * 10, 'a'),
        (10, '\U00100304' + '\u0102' * 10, '\u0102'),
        (0, 'a' * 10, '\u0102'),
        (0, 'a' * 10, '\U00100304'),
        (0, '\u0102' * 10, '\U00100304'),
        (10, '\u0102' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + 'a_' * 10, 'a_'),
        (10, '\U00100304' + '\u0102_' * 10, '\u0102_'),
        (0, 'a' * 10, 'a\u0102'),
        (0, 'a' * 10, 'a\U00100304'),
        (0, '\u0102' * 10, '\u0102\U00100304'),
    ]
    for expected, text, sub in mixed_kinds:
        self.checkequal(expected, text, 'count', sub)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
def test_find(self):
    # str-specific additions to the shared find() tests.
    string_tests.CommonTest.test_find(self)
    # test implementation details of the memchr fast path
    ucs2_text = 'a' * 100 + '\u0102'
    ucs4_text = 'a' * 100 + '\U00100304'
    fast_path = [
        (100, ucs2_text, '\u0102'),
        (-1, ucs2_text, '\u0201'),
        (-1, ucs2_text, '\u0120'),
        (-1, ucs2_text, '\u0220'),
        (100, ucs4_text, '\U00100304'),
        (-1, ucs4_text, '\U00100204'),
        (-1, ucs4_text, '\U00102004'),
    ]
    for expected, text, sub in fast_path:
        self.checkequal(expected, text, 'find', sub)
    # check mixed argument types
    haystack = 'abcdefghiabc'
    self.checkequalnofix(0, haystack, 'find', 'abc')
    self.checkequalnofix(9, haystack, 'find', 'abc', 1)
    self.checkequalnofix(-1, haystack, 'find', 'def', 4)

    # find() needs a substring argument, and it has to be a str
    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
    # test mixed kinds
    mixed_kinds = [
        (100, '\u0102' * 100 + 'a', 'a'),
        (100, '\U00100304' * 100 + 'a', 'a'),
        (100, '\U00100304' * 100 + '\u0102', '\u0102'),
        (-1, 'a' * 100, '\u0102'),
        (-1, 'a' * 100, '\U00100304'),
        (-1, '\u0102' * 100, '\U00100304'),
        (100, '\u0102' * 100 + 'a_', 'a_'),
        (100, '\U00100304' * 100 + 'a_', 'a_'),
        (100, '\U00100304' * 100 + '\u0102_', '\u0102_'),
        (-1, 'a' * 100, 'a\u0102'),
        (-1, 'a' * 100, 'a\U00100304'),
        (-1, '\u0102' * 100, '\u0102\U00100304'),
    ]
    for expected, text, sub in mixed_kinds:
        self.checkequal(expected, text, 'find', sub)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000228
def test_rfind(self):
    # str-specific additions to the shared rfind() tests.
    string_tests.CommonTest.test_rfind(self)
    # test implementation details of the memrchr fast path
    ucs2_text = '\u0102' + 'a' * 100
    ucs4_text = '\U00100304' + 'a' * 100
    fast_path = [
        (0, ucs2_text, '\u0102'),
        (-1, ucs2_text, '\u0201'),
        (-1, ucs2_text, '\u0120'),
        (-1, ucs2_text, '\u0220'),
        (0, ucs4_text, '\U00100304'),
        (-1, ucs4_text, '\U00100204'),
        (-1, ucs4_text, '\U00102004'),
    ]
    for expected, text, sub in fast_path:
        self.checkequal(expected, text, 'rfind', sub)
    # checkequalnofix also requires the exact result type; each case is
    # listed once.
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
    # test mixed kinds
    mixed_kinds = [
        (0, 'a' + '\u0102' * 100, 'a'),
        (0, 'a' + '\U00100304' * 100, 'a'),
        (0, '\u0102' + '\U00100304' * 100, '\u0102'),
        (-1, 'a' * 100, '\u0102'),
        (-1, 'a' * 100, '\U00100304'),
        (-1, '\u0102' * 100, '\U00100304'),
        (0, '_a' + '\u0102' * 100, '_a'),
        (0, '_a' + '\U00100304' * 100, '_a'),
        (0, '_\u0102' + '\U00100304' * 100, '_\u0102'),
        (-1, 'a' * 100, '\u0102a'),
        (-1, 'a' * 100, '\U00100304a'),
        (-1, '\u0102' * 100, '\U00100304\u0102'),
    ]
    for expected, text, sub in mixed_kinds:
        self.checkequal(expected, text, 'rfind', sub)
Guido van Rossum8b264542000-12-19 02:22:31 +0000256
def test_index(self):
    # str-specific additions to the shared index() tests.
    string_tests.CommonTest.test_index(self)
    haystack = 'abcdefghiabc'
    self.checkequalnofix(0, haystack, 'index', '')
    self.checkequalnofix(3, haystack, 'index', 'def')
    self.checkequalnofix(0, haystack, 'index', 'abc')
    self.checkequalnofix(9, haystack, 'index', 'abc', 1)
    # a substring that is not found raises ValueError
    not_found = [
        ('abcdefghiabc', ('hib',)),
        ('abcdefghiab', ('abc', 1)),
        ('abcdefghi', ('ghi', 8)),
        ('abcdefghi', ('ghi', -1)),
    ]
    for text, args in not_found:
        self.assertRaises(ValueError, text.index, *args)
    # test mixed kinds
    narrow = 'a' * 100
    wider = '\u0102' * 100
    for text, sub in [('\u0102' * 100 + 'a', 'a'),
                      ('\U00100304' * 100 + 'a', 'a'),
                      ('\U00100304' * 100 + '\u0102', '\u0102')]:
        self.checkequal(100, text, 'index', sub)
    for text, sub in [(narrow, '\u0102'),
                      (narrow, '\U00100304'),
                      (wider, '\U00100304')]:
        self.assertRaises(ValueError, text.index, sub)
    for text, sub in [('\u0102' * 100 + 'a_', 'a_'),
                      ('\U00100304' * 100 + 'a_', 'a_'),
                      ('\U00100304' * 100 + '\u0102_', '\u0102_')]:
        self.checkequal(100, text, 'index', sub)
    for text, sub in [(narrow, 'a\u0102'),
                      (narrow, 'a\U00100304'),
                      (wider, '\u0102\U00100304')]:
        self.assertRaises(ValueError, text.index, sub)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000280
def test_rindex(self):
    # str-specific additions to the shared rindex() tests.
    string_tests.CommonTest.test_rindex(self)
    haystack = 'abcdefghiabc'
    self.checkequalnofix(12, haystack, 'rindex', '')
    self.checkequalnofix(3, haystack, 'rindex', 'def')
    self.checkequalnofix(9, haystack, 'rindex', 'abc')
    self.checkequalnofix(0, haystack, 'rindex', 'abc', 0, -1)

    # a substring that is not found raises ValueError
    not_found = [
        ('abcdefghiabc', ('hib',)),
        ('defghiabc', ('def', 1)),
        ('defghiabc', ('abc', 0, -1)),
        ('abcdefghi', ('ghi', 0, 8)),
        ('abcdefghi', ('ghi', 0, -1)),
    ]
    for text, args in not_found:
        self.assertRaises(ValueError, text.rindex, *args)
    # test mixed kinds
    narrow = 'a' * 100
    wider = '\u0102' * 100
    for text, sub in [('a' + '\u0102' * 100, 'a'),
                      ('a' + '\U00100304' * 100, 'a'),
                      ('\u0102' + '\U00100304' * 100, '\u0102')]:
        self.checkequal(0, text, 'rindex', sub)
    for text, sub in [(narrow, '\u0102'),
                      (narrow, '\U00100304'),
                      (wider, '\U00100304')]:
        self.assertRaises(ValueError, text.rindex, sub)
    for text, sub in [('_a' + '\u0102' * 100, '_a'),
                      ('_a' + '\U00100304' * 100, '_a'),
                      ('_\u0102' + '\U00100304' * 100, '_\u0102')]:
        self.checkequal(0, text, 'rindex', sub)
    for text, sub in [(narrow, '\u0102a'),
                      (narrow, '\U00100304a'),
                      (wider, '\U00100304\u0102')]:
        self.assertRaises(ValueError, text.rindex, sub)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000306
def test_maketrans_translate(self):
    # translate() with hand-built tables and with maketrans() tables.
    maketrans = self.type2test.maketrans

    # these work with plain translate()
    plain_tables = [
        ('bbbc', 'abababc', {ord('a'): None}),
        ('iiic', 'abababc', {ord('a'): None, ord('b'): ord('i')}),
        ('iiix', 'abababc',
         {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}),
        ('c', 'abababc', {ord('a'): None, ord('b'): ''}),
        ('xyyx', 'xzx', {ord('z'): 'yy'}),
    ]
    for expected, text, table in plain_tables:
        self.checkequalnofix(expected, text, 'translate', table)

    # this needs maketrans()
    self.checkequalnofix('abababc', 'abababc', 'translate',
                         {'b': '<i>'})
    tbl = maketrans({'a': None, 'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
    # test alternative way of calling maketrans()
    tbl = maketrans('abc', 'xyz', 'd')
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

    # various tests switching from ASCII to latin1 or the opposite;
    # same length, remove a letter, or replace with a longer string.
    self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
                     "[X]")
    switching = [
        ("[a]", {'a': 'X'}, "[X]"),
        ("[a]", {'a': None}, "[]"),
        ("[a]", {'a': 'XXX'}, "[XXX]"),
        ("[a]", {'a': '\xe9'}, "[\xe9]"),
        ("[a]", {'a': '<\xe9>'}, "[<\xe9>]"),
        ("[\xe9]", {'\xe9': 'a'}, "[a]"),
        ("[\xe9]", {'\xe9': None}, "[]"),
    ]
    for text, mapping, expected in switching:
        self.assertEqual(text.translate(str.maketrans(mapping)), expected)

    # invalid Unicode characters
    invalid_char = 0x10ffff+1
    for before in "a\xe9\u20ac\U0010ffff":
        mapping = str.maketrans({before: invalid_char})
        text = "[%s]" % before
        self.assertRaises(ValueError, text.translate, mapping)

    # errors
    bad_maketrans_calls = [
        (TypeError, ()),
        (ValueError, ('abc', 'defg')),
        (TypeError, (2, 'def')),
        (TypeError, ('abc', 2)),
        (TypeError, ('abc', 'def', 2)),
        (ValueError, ({'xy': 2},)),
        (TypeError, ({(1,): 2},)),
    ]
    for exc_type, args in bad_maketrans_calls:
        self.assertRaises(exc_type, maketrans, *args)

    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000366
def test_split(self):
    # str-specific additions to the shared split() tests.
    string_tests.CommonTest.test_split(self)

    # checkequalnofix also requires the exact result type; each case is
    # listed once.
    self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
    # test mixed kinds
    for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left *= 9
        right *= 9
        whole = left + right
        for delim in ('c', '\u0102', '\U00010302'):
            # one-character and two-character separators
            for sep in (delim, delim * 2):
                # separator absent: the text comes back as a single item
                self.checkequal([whole], whole, 'split', sep)
                self.checkequal([left, right],
                                left + sep + right, 'split', sep)
386 left + delim * 2 + right, 'split', delim *2)
387
def test_rsplit(self):
    # str-specific additions to the shared rsplit() tests.
    string_tests.CommonTest.test_rsplit(self)
    # test mixed kinds
    for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left *= 9
        right *= 9
        whole = left + right
        for delim in ('c', '\u0102', '\U00010302'):
            # one-character and two-character separators
            for sep in (delim, delim * 2):
                # separator absent: the text comes back as a single item
                self.checkequal([whole], whole, 'rsplit', sep)
                self.checkequal([left, right],
                                left + sep + right, 'rsplit', sep)
402 left + delim * 2 + right, 'rsplit', delim *2)
403
def test_partition(self):
    # str-specific additions to the shared partition() tests.
    string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
    # test mixed kinds
    for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left *= 9
        right *= 9
        whole = left + right
        for delim in ('c', '\u0102', '\U00010302'):
            # one-character and two-character separators
            for sep in (delim, delim * 2):
                # separator absent: everything lands in the first slot
                self.checkequal((whole, '', ''),
                                whole, 'partition', sep)
                self.checkequal((left, sep, right),
                                left + sep + right, 'partition', sep)
418 left + delim * 2 + right, 'partition', delim * 2)
419
def test_rpartition(self):
    # str-specific additions to the shared rpartition() tests.
    string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
    # test mixed kinds
    for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left *= 9
        right *= 9
        whole = left + right
        for delim in ('c', '\u0102', '\U00010302'):
            # one-character and two-character separators
            for sep in (delim, delim * 2):
                # separator absent: everything lands in the last slot
                self.checkequal(('', '', whole),
                                whole, 'rpartition', sep)
                self.checkequal((left, sep, right),
                                left + sep + right, 'rpartition', sep)
434 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000435
def test_join(self):
    # str-specific additions to the shared join() tests.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    class MyWrapper:
        def __init__(self, sval): self.sval = sval
        def __str__(self): return self.sval

    # checkequalnofix also requires the exact result type; each case is
    # listed once.
    joined_cases = [
        # (expected, separator, items)
        ('a b c d', ' ', ['a', 'b', 'c', 'd']),
        ('abcd', '', ('a', 'b', 'c', 'd')),
        ('w x y z', ' ', string_tests.Sequence('wxyz')),
    ]
    for expected, sep, items in joined_cases:
        self.checkequalnofix(expected, sep, 'join', items)
    # items that are not str must raise TypeError, even when they
    # define __str__
    rejected_items = [
        ['1', '2', MyWrapper('foo')],
        ['1', '2', '3', bytes()],
        [1, 2, 3],
        ['1', '2', 3],
    ]
    for items in rejected_items:
        self.checkraises(TypeError, ' ', 'join', items)
Marc-André Lemburge5034372000-08-08 08:04:29 +0000455
def test_replace(self):
    # str-specific additions to the shared replace() tests.
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
    # test mixed kinds
    for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
        left *= 9
        right *= 9
        whole = left + right
        for delim in ('c', '\u0102', '\U00010302'):
            for repl in ('d', '\u0103', '\U00010303'):
                # one-character and two-character search strings
                for old in (delim, delim * 2):
                    # nothing to replace: the text is unchanged
                    self.checkequal(whole, whole, 'replace', old, repl)
                    self.checkequal(left + repl + right,
                                    left + old + right,
                                    'replace', old, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000478
@support.cpython_only
def test_replace_id(self):
    # Replacing a pattern with itself must hand back the same object.
    text = 'abc def'
    pattern = 'abc'
    unchanged = text.replace(pattern, pattern)
    self.assertIs(unchanged, text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200484
def test_bytes_comparison(self):
    # A str never compares equal to bytes or bytearray with the same
    # ASCII content.
    with support.check_warnings():
        # keep BytesWarning from interfering with the comparisons
        warnings.simplefilter('ignore', BytesWarning)
        for other in (b'abc', bytearray(b'abc')):
            self.assertEqual('abc' == other, False)
            self.assertEqual('abc' != other, True)
Brett Cannon40430012007-10-22 20:24:51 +0000492
Walter Dörwald28256f22003-01-19 16:59:20 +0000493 def test_comparison(self):
494 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000495 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000496 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000497 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000498
499 if 0:
500 # Move these tests to a Unicode collation module test...
501 # Testing UTF-16 code point order comparisons...
502
503 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000504 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000505 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000506 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000507
508 # Non surrogate above surrogate value, fixup required
509 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000510 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000511
512 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000513 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000514 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000515 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000516 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000517 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000518 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000519 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000520 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000521 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000522 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000523 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000524 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000525 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000526 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000527 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000528 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000529 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000530 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000531 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000532 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000533 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000534 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000535 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000536 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000537 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000538 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000539 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000540 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000541 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000542 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000544 test_lecmp(s, s2)
545
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000546 test_fixup('\ue000')
547 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000548
549 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000550 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000551
def test_islower(self):
    # str-specific additions to the shared islower() tests.
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    self.assertFalse('\u2167'.islower())
    self.assertTrue('\u2177'.islower())
    # non-BMP, uppercase
    for ch in ('\U00010401', '\U00010427'):
        self.assertFalse(ch.islower())
    # non-BMP, lowercase
    for ch in ('\U00010429', '\U0001044E'):
        self.assertTrue(ch.islower())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000566
def test_isupper(self):
    # Run the checks shared with the other string-like types first, then
    # the str-specific code points below.
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The U+1FFC check is skipped when sys.platform starts with 'java'.
    if not sys.platform.startswith('java'):
        self.checkequalnofix(False, '\u1FFc', 'isupper')

    upper = (
        '\u2167',                    # BMP, uppercase
        '\U00010401', '\U00010427',  # non-BMP, uppercase
    )
    not_upper = (
        '\u2177',                    # BMP, lowercase
        '\U00010429', '\U0001044E',  # non-BMP, lowercase
        '\U0001F40D', '\U0001F46F',  # non-BMP, non-cased
    )
    # Name the offending character on failure, as the sibling predicate
    # tests (test_isspace, test_isdigit, ...) do.
    for ch in upper:
        self.assertTrue(ch.isupper(), '{!a} is upper.'.format(ch))
    for ch in not_upper:
        self.assertFalse(ch.isupper(), '{!a} is not upper.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000582
def test_istitle(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for sample in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, sample, 'istitle')

    # non-BMP, uppercase + lowercase
    titled_pairs = ('\U00010401\U00010429', '\U00010427\U0001044E')
    for pair in titled_pairs:
        self.assertTrue(pair.istitle())

    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    lone_chars = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for ch in lone_chars:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
594
def test_isspace(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)

    # (expected result, BMP character)
    bmp_cases = (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    )
    for expected, ch in bmp_cases:
        self.checkequalnofix(expected, ch, 'isspace')

    # apparently there are no non-BMP spaces chars in Unicode 6
    astral_chars = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F',
    )
    for ch in astral_chars:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
604
def test_isalnum(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)

    # Every one of these non-BMP characters must report as alphanumeric.
    astral_alnum = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107',
    )
    for ch in astral_alnum:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000610
def test_isalpha(self):
    # Run the checks shared with the other string-like types first, then
    # the str-specific code points below.
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')

    # non-BMP, cased
    alpha = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E')
    # non-BMP, non-cased
    not_alpha = ('\U0001F40D', '\U0001F46F')

    # Name the offending character on failure, as the sibling predicate
    # tests (test_isalnum, test_isdigit, ...) do.
    for ch in alpha:
        self.assertTrue(ch.isalpha(), '{!a} is alpha.'.format(ch))
    for ch in not_alpha:
        self.assertFalse(ch.isalpha(), '{!a} is not alpha.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000622
def test_isdecimal(self):
    # (expected result, input) pairs checked through checkequalnofix.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    # Non-BMP characters, split by the expected answer.
    non_decimal = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107',
    )
    decimal = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for ch in non_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
640
def test_isdigit(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)

    # (expected result, BMP character)
    bmp_cases = (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    )
    for expected, ch in bmp_cases:
        self.checkequalnofix(expected, ch, 'isdigit')

    # Non-BMP characters, split by the expected answer.
    non_digits = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F', '\U00011065',
    )
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for ch in non_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
652
def test_isnumeric(self):
    # (expected result, input) pairs checked through checkequalnofix.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    with self.assertRaises(TypeError):
        "abc".isnumeric(42)

    # Non-BMP characters, split by the expected answer.
    non_numeric = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F',
    )
    numeric = (
        '\U00011065', '\U0001D7F6', '\U00011066',
        '\U000104A0', '\U0001F107',
    )
    for ch in non_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
671
Martin v. Löwis47383402007-08-15 07:32:56 +0000672 def test_isidentifier(self):
673 self.assertTrue("a".isidentifier())
674 self.assertTrue("Z".isidentifier())
675 self.assertTrue("_".isidentifier())
676 self.assertTrue("b0".isidentifier())
677 self.assertTrue("bc".isidentifier())
678 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000679 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500680 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000681
682 self.assertFalse(" ".isidentifier())
683 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000684 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000685 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000686
Georg Brandl559e5d72008-06-11 18:37:52 +0000687 def test_isprintable(self):
688 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000689 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000690 self.assertTrue("abcdefg".isprintable())
691 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000692 # some defined Unicode character
693 self.assertTrue("\u0374".isprintable())
694 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000695 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000696 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000697 self.assertFalse("\ud800".isprintable())
698
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300699 self.assertTrue('\U0001F46F'.isprintable())
700 self.assertFalse('\U000E0020'.isprintable())
701
702 def test_surrogates(self):
703 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
704 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
705 self.assertTrue(s.islower())
706 self.assertFalse(s.isupper())
707 self.assertFalse(s.istitle())
708 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
709 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
710 self.assertFalse(s.islower())
711 self.assertTrue(s.isupper())
712 self.assertTrue(s.istitle())
713
714 for meth_name in ('islower', 'isupper', 'istitle'):
715 meth = getattr(str, meth_name)
716 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
717 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
718
719 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
720 'isdecimal', 'isnumeric',
721 'isidentifier', 'isprintable'):
722 meth = getattr(str, meth_name)
723 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
724 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
725 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
726 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
727
728
def test_lower(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.CommonTest.test_lower(self)

    # (input, expected str.lower() result). The original listed the
    # 'A\u0345\u03a3' case twice; it appears once here.
    cases = (
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
        ('fi', 'fi'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0345\u03c3'),
        ('A\u0345\u03a3', 'a\u0345\u03c2'),
        ('A\u0345\u03a3a', 'a\u0345\u03c3a'),
        ('A\u03a3\u0345', 'a\u03c2\u0345'),
        ('\u03a3\u0345 ', '\u03c3\u0345 '),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2177'),
    )
    for text, expected in cases:
        # Name the input so a failure identifies the case.
        self.assertEqual(text.lower(), expected,
                         '{!a}.lower()'.format(text))
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300750
Benjamin Petersond5890c82012-01-14 13:23:30 -0500751 def test_casefold(self):
752 self.assertEqual('hello'.casefold(), 'hello')
753 self.assertEqual('hELlo'.casefold(), 'hello')
754 self.assertEqual('ß'.casefold(), 'ss')
755 self.assertEqual('fi'.casefold(), 'fi')
756 self.assertEqual('\u03a3'.casefold(), '\u03c3')
757 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700758 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500759
def test_upper(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.CommonTest.test_upper(self)

    # (input, expected str.upper() result)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
        ('fi', 'FI'),
        ('\u0130', '\u0130'),
        ('\u03a3', '\u03a3'),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
        ('\U0008fffe', '\U0008fffe'),
        ('\u2177', '\u2167'),
    )
    for text, expected in cases:
        self.assertEqual(text.upper(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300776
def test_capitalize(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.CommonTest.test_capitalize(self)

    # (input, expected str.capitalize() result)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
        ('h\u0130', 'H\u0069\u0307'),
        ('\u1fd2\u0130', '\u0399\u0308\u0300\u0069\u0307'),
        ('finnish', 'FInnish'),
        ('A\u0345\u03a3', 'A\u0345\u03c2'),
    )
    for text, expected in cases:
        self.assertEqual(text.capitalize(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300793
def test_title(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)

    # (input, expected str.title() result)
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U0001044F \U0001044F\U0001044F',
         '\U00010427\U0001044F \U00010427\U0001044F'),
        ('\U00010427\U0001044F \U00010427\U0001044F',
         '\U00010427\U0001044F \U00010427\U0001044F'),
        ('\U0001044F\U00010427 \U0001044F\U00010427',
         '\U00010427\U0001044F \U00010427\U0001044F'),
        ('X\U00010427x\U0001044F X\U00010427x\U0001044F',
         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
        ('fiNNISH', 'Finnish'),
        ('A\u03a3 \u1fa1xy', 'A\u03c2 \u1fa9xy'),
        ('A\u03a3A', 'A\u03c3a'),
    )
    for text, expected in cases:
        self.assertEqual(text.title(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300810
def test_swapcase(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.CommonTest.test_swapcase(self)

    # (input, expected str.swapcase() result). The original repeated the
    # '\u03a3' and 'A\u0345\u03a3' cases; each appears once here.
    cases = (
        ('\U0001044F', '\U00010427'),
        ('\U00010427', '\U0001044F'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U0001044F\U00010427'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044FX\U00010427'),
        ('fi', 'FI'),
        ('\u0130', '\u0069\u0307'),
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        ('\u03a3', '\u03c3'),
        ('\u0345\u03a3', '\u0399\u03c3'),
        ('A\u0345\u03a3', 'a\u0399\u03c2'),
        ('A\u0345\u03a3a', 'a\u0399\u03c3A'),
        ('A\u03a3\u0345', 'a\u03c2\u0399'),
        ('\u03a3\u0345 ', '\u03c3\u0399 '),
        ('ß', 'SS'),
        ('\u1fd2', '\u0399\u0308\u0300'),
    )
    for text, expected in cases:
        # Name the input so a failure identifies the case.
        self.assertEqual(text.swapcase(), expected,
                         '{!a}.swapcase()'.format(text))
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300836
def test_center(self):
    # Generic checks shared with the other string-like types come first.
    string_tests.CommonTest.test_center(self)

    # Centre 'x' with a non-BMP fill character at increasing widths.
    fill = '\U0010FFFF'
    cases = (
        (2, 'x\U0010FFFF'),
        (3, '\U0010FFFFx\U0010FFFF'),
        (4, '\U0010FFFFx\U0010FFFF\U0010FFFF'),
    )
    for width, expected in cases:
        self.assertEqual('x'.center(width, fill), expected)
845
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    # Issue #22643
    # Upper-casing this very long string must raise OverflowError.
    oversized = "ü" * (2**32//12 + 1)
    with self.assertRaises(OverflowError):
        oversized.upper()
851
Walter Dörwald28256f22003-01-19 16:59:20 +0000852 def test_contains(self):
853 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000854 self.assertIn('a', 'abdb')
855 self.assertIn('a', 'bdab')
856 self.assertIn('a', 'bdaba')
857 self.assertIn('a', 'bdba')
858 self.assertNotIn('a', 'bdb')
859 self.assertIn('a', 'bdba')
860 self.assertIn('a', ('a',1,None))
861 self.assertIn('a', (1,None,'a'))
862 self.assertIn('a', ('a',1,None))
863 self.assertIn('a', (1,None,'a'))
864 self.assertNotIn('a', ('x',1,'y'))
865 self.assertNotIn('a', ('x',1,None))
866 self.assertNotIn('abcd', 'abcxxxx')
867 self.assertIn('ab', 'abcd')
868 self.assertIn('ab', 'abc')
869 self.assertIn('ab', (1,None,'ab'))
870 self.assertIn('', 'abc')
871 self.assertIn('', '')
872 self.assertIn('', 'abc')
873 self.assertNotIn('\0', 'abc')
874 self.assertIn('\0', '\0abc')
875 self.assertIn('\0', 'abc\0')
876 self.assertIn('a', '\0abc')
877 self.assertIn('asdf', 'asdf')
878 self.assertNotIn('asdf', 'asd')
879 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000880
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000881 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200882 # test mixed kinds
883 for fill in ('a', '\u0100', '\U00010300'):
884 fill *= 9
885 for delim in ('c', '\u0102', '\U00010302'):
886 self.assertNotIn(delim, fill)
887 self.assertIn(delim, fill + delim)
888 self.assertNotIn(delim * 2, fill)
889 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000890
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300891 def test_issue18183(self):
892 '\U00010000\U00100000'.lower()
893 '\U00010000\U00100000'.casefold()
894 '\U00010000\U00100000'.upper()
895 '\U00010000\U00100000'.capitalize()
896 '\U00010000\U00100000'.title()
897 '\U00010000\U00100000'.swapcase()
898 '\U00100000'.center(3, '\U00010000')
899 '\U00100000'.ljust(3, '\U00010000')
900 '\U00100000'.rjust(3, '\U00010000')
901
Eric Smith8c663262007-08-25 02:26:07 +0000902 def test_format(self):
903 self.assertEqual(''.format(), '')
904 self.assertEqual('a'.format(), 'a')
905 self.assertEqual('ab'.format(), 'ab')
906 self.assertEqual('a{{'.format(), 'a{')
907 self.assertEqual('a}}'.format(), 'a}')
908 self.assertEqual('{{b'.format(), '{b')
909 self.assertEqual('}}b'.format(), '}b')
910 self.assertEqual('a{{b'.format(), 'a{b')
911
912 # examples from the PEP:
913 import datetime
914 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
915 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
916 "My name is Fred")
917 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
918 "My name is Fred :-{}")
919
920 d = datetime.date(2007, 8, 18)
921 self.assertEqual("The year is {0.year}".format(d),
922 "The year is 2007")
923
Eric Smith8c663262007-08-25 02:26:07 +0000924 # classes we'll use for testing
925 class C:
926 def __init__(self, x=100):
927 self._x = x
928 def __format__(self, spec):
929 return spec
930
931 class D:
932 def __init__(self, x):
933 self.x = x
934 def __format__(self, spec):
935 return str(self.x)
936
937 # class with __str__, but no __format__
938 class E:
939 def __init__(self, x):
940 self.x = x
941 def __str__(self):
942 return 'E(' + self.x + ')'
943
944 # class with __repr__, but no __format__ or __str__
945 class F:
946 def __init__(self, x):
947 self.x = x
948 def __repr__(self):
949 return 'F(' + self.x + ')'
950
951 # class with __format__ that forwards to string, for some format_spec's
952 class G:
953 def __init__(self, x):
954 self.x = x
955 def __str__(self):
956 return "string is " + self.x
957 def __format__(self, format_spec):
958 if format_spec == 'd':
959 return 'G(' + self.x + ')'
960 return object.__format__(self, format_spec)
961
Eric Smith739e2ad2007-08-27 19:07:22 +0000962 class I(datetime.date):
963 def __format__(self, format_spec):
964 return self.strftime(format_spec)
965
Eric Smith185e30c2007-08-30 22:23:08 +0000966 class J(int):
967 def __format__(self, format_spec):
968 return int.__format__(self * 2, format_spec)
969
Eric Smith8c663262007-08-25 02:26:07 +0000970
971 self.assertEqual(''.format(), '')
972 self.assertEqual('abc'.format(), 'abc')
973 self.assertEqual('{0}'.format('abc'), 'abc')
974 self.assertEqual('{0:}'.format('abc'), 'abc')
975# self.assertEqual('{ 0 }'.format('abc'), 'abc')
976 self.assertEqual('X{0}'.format('abc'), 'Xabc')
977 self.assertEqual('{0}X'.format('abc'), 'abcX')
978 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
979 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
980 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
981 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
982 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
983 self.assertEqual('{0}'.format(-15), '-15')
984 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
985 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
986 self.assertEqual('{{'.format(), '{')
987 self.assertEqual('}}'.format(), '}')
988 self.assertEqual('{{}}'.format(), '{}')
989 self.assertEqual('{{x}}'.format(), '{x}')
990 self.assertEqual('{{{0}}}'.format(123), '{123}')
991 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
992 self.assertEqual('}}{{'.format(), '}{')
993 self.assertEqual('}}x{{'.format(), '}x{')
994
Eric Smith7ade6482007-08-26 22:27:13 +0000995 # weird field names
996 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
997 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +0000998 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +0000999
Eric Smith8c663262007-08-25 02:26:07 +00001000 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1001 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1002 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1003 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1004 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1005 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1006 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1007
Eric Smith8c663262007-08-25 02:26:07 +00001008 # strings
1009 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1010 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1011 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1012 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1013 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1014 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1015 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1016 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1017 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1018 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1019 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1020 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1021 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1022 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1023 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1024 self.assertEqual('{0:>7s}'.format('result'), ' result')
1025 self.assertEqual('{0:>8s}'.format('result'), ' result')
1026 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1027 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1028 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1029 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1030 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1031 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1032
Eric V. Smith2ea97122014-04-14 11:55:10 -04001033 # issue 12546: use \x00 as a fill character
1034 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1035 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1036 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1037 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1038
1039 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1040 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1041 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1042 self.assertEqual('{0:<6}'.format(3), '3 ')
1043
1044 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1045 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1046 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1047 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1048
1049 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1050 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1051 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1052 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1053
Eric Smith8c663262007-08-25 02:26:07 +00001054 # format specifiers for user defined type
1055 self.assertEqual('{0:abc}'.format(C()), 'abc')
1056
Georg Brandld52429f2008-07-04 15:55:02 +00001057 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001058 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1059 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1060 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1061 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1062 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1063 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1064 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001065 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001066 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1067 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001068 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001069 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001070 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001071 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1072 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001073 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001074
Eric Smith8c663262007-08-25 02:26:07 +00001075 # test fallback to object.__format__
1076 self.assertEqual('{0}'.format({}), '{}')
1077 self.assertEqual('{0}'.format([]), '[]')
1078 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001079
Eric Smith8c663262007-08-25 02:26:07 +00001080 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001081 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1082
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001083 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1084 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1085 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001086
Eric Smith739e2ad2007-08-27 19:07:22 +00001087 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1088 month=8,
1089 day=27)),
1090 "date: 2007-08-27")
1091
Eric Smith185e30c2007-08-30 22:23:08 +00001092 # test deriving from a builtin type and overriding __format__
1093 self.assertEqual("{0}".format(J(10)), "20")
1094
1095
Eric Smith8c663262007-08-25 02:26:07 +00001096 # string format specifiers
1097 self.assertEqual('{0:}'.format('a'), 'a')
1098
1099 # computed format specifiers
1100 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1101 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1102 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1103 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1104 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1105
1106 # test various errors
1107 self.assertRaises(ValueError, '{'.format)
1108 self.assertRaises(ValueError, '}'.format)
1109 self.assertRaises(ValueError, 'a{'.format)
1110 self.assertRaises(ValueError, 'a}'.format)
1111 self.assertRaises(ValueError, '{a'.format)
1112 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001113 self.assertRaises(IndexError, '{0}'.format)
1114 self.assertRaises(IndexError, '{1}'.format, 'abc')
1115 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001116 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001117 self.assertRaises(ValueError, "abc{0:{}".format)
1118 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001119 self.assertRaises(IndexError, "{0.}".format)
1120 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001121 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001122 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001123 self.assertRaises(KeyError, "{0]}".format)
1124 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001125 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001126 self.assertRaises(ValueError, "{0[0}".format, 0)
1127 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1128 self.assertRaises(KeyError, "{c]}".format)
1129 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1130 self.assertRaises(ValueError, "{0}}".format, 0)
1131 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001132 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001133 self.assertRaises(ValueError, "{0!}".format, 0)
1134 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001135 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001136 self.assertRaises(IndexError, "{:}".format)
1137 self.assertRaises(IndexError, "{:s}".format)
1138 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001139 big = "23098475029384702983476098230754973209482573"
1140 self.assertRaises(ValueError, ("{" + big + "}").format)
1141 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001142
Eric Smith41669ca2009-05-23 14:23:22 +00001143 # issue 6089
1144 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1145 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1146
Eric Smith8c663262007-08-25 02:26:07 +00001147 # can't have a replacement on the field name portion
1148 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1149
1150 # exceed maximum recursion depth
1151 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1152 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1153 0, 1, 2, 3, 4, 5, 6, 7)
1154
1155 # string format spec errors
1156 self.assertRaises(ValueError, "{0:-s}".format, '')
1157 self.assertRaises(ValueError, format, "", "-")
1158 self.assertRaises(ValueError, "{0:=s}".format, '')
1159
Eric Smithb1ebcc62008-07-15 13:02:41 +00001160 # Alternate formatting is not supported
1161 self.assertRaises(ValueError, format, '', '#')
1162 self.assertRaises(ValueError, format, '', '#20')
1163
Victor Stinnerece58de2012-04-23 23:36:38 +02001164 # Non-ASCII
1165 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1166 'ABC\u0410\u0411\u0412')
1167 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1168 'ABC')
1169 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1170 '')
1171
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001172 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001173 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1174 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1175 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1176 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1177 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1178 self.assertRaises(ValueError, "{a{}b}".format, 42)
1179 self.assertRaises(ValueError, "{a{b}".format, 42)
1180 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001181
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001182 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001183
Eric Smith27bbca62010-11-04 17:06:58 +00001184 def test_format_map(self):
1185 self.assertEqual(''.format_map({}), '')
1186 self.assertEqual('a'.format_map({}), 'a')
1187 self.assertEqual('ab'.format_map({}), 'ab')
1188 self.assertEqual('a{{'.format_map({}), 'a{')
1189 self.assertEqual('a}}'.format_map({}), 'a}')
1190 self.assertEqual('{{b'.format_map({}), '{b')
1191 self.assertEqual('}}b'.format_map({}), '}b')
1192 self.assertEqual('a{{b'.format_map({}), 'a{b')
1193
1194 # using mappings
1195 class Mapping(dict):
1196 def __missing__(self, key):
1197 return key
1198 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1199 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1200
1201 class InternalMapping:
1202 def __init__(self):
1203 self.mapping = {'a': 'hello'}
1204 def __getitem__(self, key):
1205 return self.mapping[key]
1206 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1207
1208
Eric Smith27bbca62010-11-04 17:06:58 +00001209 class C:
1210 def __init__(self, x=100):
1211 self._x = x
1212 def __format__(self, spec):
1213 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001214 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1215
1216 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001217 self.assertRaises(TypeError, ''.format_map)
1218 self.assertRaises(TypeError, 'a'.format_map)
1219
1220 self.assertRaises(ValueError, '{'.format_map, {})
1221 self.assertRaises(ValueError, '}'.format_map, {})
1222 self.assertRaises(ValueError, 'a{'.format_map, {})
1223 self.assertRaises(ValueError, 'a}'.format_map, {})
1224 self.assertRaises(ValueError, '{a'.format_map, {})
1225 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001226
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001227 # issue #12579: can't supply positional params to format_map
1228 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1229 self.assertRaises(ValueError, '{}'.format_map, 'a')
1230 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1231
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001232 def test_format_huge_precision(self):
1233 format_string = ".{}f".format(sys.maxsize + 1)
1234 with self.assertRaises(ValueError):
1235 result = format(2.34, format_string)
1236
1237 def test_format_huge_width(self):
1238 format_string = "{}f".format(sys.maxsize + 1)
1239 with self.assertRaises(ValueError):
1240 result = format(2.34, format_string)
1241
1242 def test_format_huge_item_number(self):
1243 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1244 with self.assertRaises(ValueError):
1245 result = format_string.format(2.34)
1246
Eric Smith8ec90442009-03-14 12:29:34 +00001247 def test_format_auto_numbering(self):
1248 class C:
1249 def __init__(self, x=100):
1250 self._x = x
1251 def __format__(self, spec):
1252 return spec
1253
1254 self.assertEqual('{}'.format(10), '10')
1255 self.assertEqual('{:5}'.format('s'), 's ')
1256 self.assertEqual('{!r}'.format('s'), "'s'")
1257 self.assertEqual('{._x}'.format(C(10)), '10')
1258 self.assertEqual('{[1]}'.format([1, 2]), '2')
1259 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1260 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1261
1262 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1263 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1264
1265 # can't mix and match numbering and auto-numbering
1266 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1267 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1268 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1269 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1270
1271 # can mix and match auto-numbering and named
1272 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1273 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1274 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1275 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1276
def test_formatting(self):
    # %-style formatting on str: the checks shared with the other string
    # types run first, then the str-specific cases below.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads to five columns, hence the double space before a
    # four-character value such as 3.00.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    # %c accepts a code point or a one-character string.
    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes width/precision from the argument tuple; a negative
    # width left-aligns.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    # issue 19995
    class PseudoInt:
        # Integer-like: provides both __int__ and __index__.
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        # Provides __int__ only; must be rejected by the integer codes.
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001363
Ethan Furmanfb137212013-08-31 10:18:55 -07001364 def test_formatting_with_enum(self):
1365 # issue18780
1366 import enum
1367 class Float(float, enum.Enum):
1368 PI = 3.1415926
1369 class Int(enum.IntEnum):
1370 IDES = 15
1371 class Str(str, enum.Enum):
1372 ABC = 'abc'
1373 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001374 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1375 'Str.ABC, Str.ABC')
1376 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1377 (Str.ABC, Str.ABC,
1378 Int.IDES, Int.IDES, Int.IDES,
1379 Float.PI, Float.PI),
1380 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001381
1382 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001383 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1384 '...Str.ABC...')
1385 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1386 '...Int.IDES...')
1387 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1388 '...15...')
1389 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1390 '...15...')
1391 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1392 '...15...')
1393 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1394 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001395
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001396 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001397 format_string = "%.{}f".format(sys.maxsize + 1)
1398 with self.assertRaises(ValueError):
1399 result = format_string % 2.34
1400
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # Same check as test_formatting_huge_precision, using a precision one
    # past _testcapi.INT_MAX instead of sys.maxsize.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        format_string % 2.34
1407
1408 def test_formatting_huge_width(self):
1409 format_string = "%{}f".format(sys.maxsize + 1)
1410 with self.assertRaises(ValueError):
1411 result = format_string % 2.34
1412
Ezio Melottiba42fd52011-04-26 06:09:45 +03001413 def test_startswith_endswith_errors(self):
1414 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001415 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001416 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001417 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001418 self.assertIn('str', exc)
1419 self.assertIn('tuple', exc)
1420
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    # (the decorator is asked for de_DE / fr_FR)
    self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +00001425
Walter Dörwald28256f22003-01-19 16:59:20 +00001426 def test_constructor(self):
1427 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1428
1429 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001430 str('unicode remains unicode'),
1431 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001432 )
1433
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001434 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001435 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001436
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001437 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1438 subclass = UnicodeSubclass(text)
1439 self.assertEqual(str(subclass), text)
1440 self.assertEqual(len(subclass), len(text))
1441 if text == 'ascii':
1442 self.assertEqual(subclass.encode('ascii'), b'ascii')
1443 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001444
Walter Dörwald28256f22003-01-19 16:59:20 +00001445 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001446 str('strings are converted to unicode'),
1447 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001448 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001449
Walter Dörwald28256f22003-01-19 16:59:20 +00001450 class StringCompat:
1451 def __init__(self, x):
1452 self.x = x
1453 def __str__(self):
1454 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001455
Walter Dörwald28256f22003-01-19 16:59:20 +00001456 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001457 str(StringCompat('__str__ compatible objects are recognized')),
1458 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001459 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001460
Walter Dörwald28256f22003-01-19 16:59:20 +00001461 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001462
Walter Dörwald28256f22003-01-19 16:59:20 +00001463 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001464 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001465 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001466
Guido van Rossume2a383d2007-01-15 16:59:06 +00001467 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001468 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001469
Walter Dörwald28256f22003-01-19 16:59:20 +00001470 # unicode(obj, encoding, error) tests (this maps to
1471 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001472
Walter Dörwald28256f22003-01-19 16:59:20 +00001473 if not sys.platform.startswith('java'):
1474 self.assertRaises(
1475 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001476 str,
1477 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001478 'utf-8',
1479 'strict'
1480 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001481
Walter Dörwald28256f22003-01-19 16:59:20 +00001482 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001483 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001484 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001485 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001486
Walter Dörwald28256f22003-01-19 16:59:20 +00001487 if not sys.platform.startswith('java'):
1488 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001489 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001490 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001491 'utf-8',
1492 'strict'
1493 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001494 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001495 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001496
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001497 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001498
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001499 def test_constructor_keyword_args(self):
1500 """Pass various keyword argument combinations to the constructor."""
1501 # The object argument can be passed as a keyword.
1502 self.assertEqual(str(object='foo'), 'foo')
1503 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1504 # The errors argument without encoding triggers "decode" mode.
1505 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1506 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1507
1508 def test_constructor_defaults(self):
1509 """Check the constructor argument defaults."""
1510 # The object argument defaults to '' or b''.
1511 self.assertEqual(str(), '')
1512 self.assertEqual(str(errors='strict'), '')
1513 utf8_cent = '¢'.encode('utf-8')
1514 # The encoding argument defaults to utf-8.
1515 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1516 # The errors argument defaults to strict.
1517 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1518
Walter Dörwald28256f22003-01-19 16:59:20 +00001519 def test_codecs_utf7(self):
1520 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001521 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1522 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1523 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1524 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1525 ('+', b'+-'),
1526 ('+-', b'+--'),
1527 ('+?', b'+-?'),
1528 ('\?', b'+AFw?'),
1529 ('+?', b'+-?'),
1530 (r'\\?', b'+AFwAXA?'),
1531 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001532 (r'++--', b'+-+---'),
1533 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1534 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001535 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001536
Walter Dörwald28256f22003-01-19 16:59:20 +00001537 for (x, y) in utfTests:
1538 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001539
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001540 # Unpaired surrogates are passed through
1541 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1542 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1543 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1544 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1545 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1546 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1547 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1548 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001549
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001550 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1551 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001552
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001553 # Issue #2242: crash on some Windows/MSVC versions
Antoine Pitrou244651a2009-05-04 18:56:13 +00001554 self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1555
1556 # Direct encoded characters
1557 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1558 # Optional direct characters
1559 set_o = '!"#$%&*;<=>@[]^_`{|}'
1560 for c in set_d:
1561 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1562 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1563 for c in set_o:
1564 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001565
Walter Dörwald28256f22003-01-19 16:59:20 +00001566 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001567 self.assertEqual(''.encode('utf-8'), b'')
1568 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001569 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1570 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001571 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1572 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001573 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1574 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001575 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001576 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1577 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1578 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1579 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1580 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1581 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001582 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1583 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1584 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1585 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1586 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1587 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1588 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1589 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1590 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1591 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001592 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001593
Walter Dörwald28256f22003-01-19 16:59:20 +00001594 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001595 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1596 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1597 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598
Walter Dörwald28256f22003-01-19 16:59:20 +00001599 # Other possible utf-8 test cases:
1600 # * strict decoding testing for all of the
1601 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001602
Ezio Melotti57221d02010-07-01 07:32:02 +00001603 def test_utf8_decode_valid_sequences(self):
1604 sequences = [
1605 # single byte
1606 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1607 # 2 bytes
1608 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1609 # 3 bytes
1610 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1611 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1612 # 4 bytes
1613 (b'\xF0\x90\x80\x80', '\U00010000'),
1614 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1615 ]
1616 for seq, res in sequences:
1617 self.assertEqual(seq.decode('utf-8'), res)
1618
1619
1620 def test_utf8_decode_invalid_sequences(self):
1621 # continuation bytes in a sequence of 2, 3, or 4 bytes
1622 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001623 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001624 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001625 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001626 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1627 invalid_start_bytes = (
1628 continuation_bytes + invalid_2B_seq_start_bytes +
1629 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1630 )
1631
1632 for byte in invalid_start_bytes:
1633 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1634
1635 for sb in invalid_2B_seq_start_bytes:
1636 for cb in continuation_bytes:
1637 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1638
1639 for sb in invalid_4B_seq_start_bytes:
1640 for cb1 in continuation_bytes[:3]:
1641 for cb3 in continuation_bytes[:3]:
1642 self.assertRaises(UnicodeDecodeError,
1643 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1644
1645 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1646 self.assertRaises(UnicodeDecodeError,
1647 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1648 self.assertRaises(UnicodeDecodeError,
1649 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1650 # surrogates
1651 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1652 self.assertRaises(UnicodeDecodeError,
1653 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1654 self.assertRaises(UnicodeDecodeError,
1655 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1656 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1657 self.assertRaises(UnicodeDecodeError,
1658 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1659 self.assertRaises(UnicodeDecodeError,
1660 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1661 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1662 self.assertRaises(UnicodeDecodeError,
1663 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1664 self.assertRaises(UnicodeDecodeError,
1665 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1666
1667 def test_issue8271(self):
1668 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1669 # only the start byte and the continuation byte(s) are now considered
1670 # invalid, instead of the number of bytes specified by the start byte.
1671 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1672 # table 3-8, Row 2) for more information about the algorithm used.
1673 FFFD = '\ufffd'
1674 sequences = [
1675 # invalid start bytes
1676 (b'\x80', FFFD), # continuation byte
1677 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1678 (b'\xc0', FFFD),
1679 (b'\xc0\xc0', FFFD*2),
1680 (b'\xc1', FFFD),
1681 (b'\xc1\xc0', FFFD*2),
1682 (b'\xc0\xc1', FFFD*2),
1683 # with start byte of a 2-byte sequence
1684 (b'\xc2', FFFD), # only the start byte
1685 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001686 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001687 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1688 # with start byte of a 3-byte sequence
1689 (b'\xe1', FFFD), # only the start byte
1690 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1691 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1692 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1693 (b'\xe1\x80', FFFD), # only 1 continuation byte
1694 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1695 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1696 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1697 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1698 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1699 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1700 # with start byte of a 4-byte sequence
1701 (b'\xf1', FFFD), # only the start byte
1702 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1703 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1704 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1705 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1706 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1707 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1708 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1709 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1710 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1711 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1712 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1713 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1714 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1715 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1716 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1717 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1718 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1719 # with invalid start byte of a 4-byte sequence (rfc2279)
1720 (b'\xf5', FFFD), # only the start byte
1721 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1722 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1723 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1724 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1725 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1726 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1727 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1728 # with invalid start byte of a 5-byte sequence (rfc2279)
1729 (b'\xf8', FFFD), # only the start byte
1730 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1731 (b'\xf8\x80', FFFD*2), # only one continuation byte
1732 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1733 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1734 # with invalid start byte of a 6-byte sequence (rfc2279)
1735 (b'\xfc', FFFD), # only the start byte
1736 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1737 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1738 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1739 # invalid start byte
1740 (b'\xfe', FFFD),
1741 (b'\xfe\x80\x80', FFFD*3),
1742 # other sequences
1743 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1744 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1745 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1746 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1747 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1748 ]
1749 for n, (seq, res) in enumerate(sequences):
1750 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1751 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1752 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1753 self.assertEqual(seq.decode('utf-8', 'ignore'),
1754 res.replace('\uFFFD', ''))
1755
def to_bytestring(self, seq):
    """Turn space-separated hex byte values (e.g. 'C2 41') into bytes."""
    byte_values = [int(token, 16) for token in seq.split()]
    return bytes(byte_values)
1758
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Assert how the invalid UTF-8 byte string *seq* gets decoded.

    With the default (strict) error handling a UnicodeDecodeError whose
    message contains *err* must be raised.  errors='replace' must give
    *res*, and errors='ignore' must give *res* with every U+FFFD removed.
    The 'replace' and 'ignore' checks are repeated with *seq* embedded
    between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    padded = b'aaaa' + seq + b'bbbb'
    without_markers = res.replace('\ufffd', '')
    for handler, expected in (('replace', res),
                              ('ignore', without_markers)):
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(padded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')
1777
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first byte
    is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
    4-bytes sequence. The invalid start byte is replaced with a single
    U+FFFD when errors='replace'.
    E.g. <80> is a continuation byte and can appear only after a start byte.
    """
    FFFD = '\ufffd'
    # Iterating over a bytes object yields ints, so each value is wrapped
    # back into a one-byte bytes object before decoding.
    for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
        self.assertCorrectUTF8Decoding(bytes([byte]), FFFD,
                                       'invalid start byte')
1790
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the string
    ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
    enough continuation bytes. The incomplete sequence is replaced with a
    single U+FFFD when errors='replace'.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it's followed by only 2 valid continuation bytes and the
    last continuation byte is missing.
    Note: the continuation bytes must all be valid; if one of them is
    invalid another error will be raised.
    """
    FFFD = '\ufffd'
    # Truncated sequences, written as space-separated hex byte values.
    sequences = [
        'C2', 'DF',
        'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
        'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
        'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF',
    ]
    for seq in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), FFFD,
                                       'unexpected end of data')
1816
def test_invalid_cb_for_2bytes_seq(self):
    """
    Check the 'invalid continuation byte' error for 2-bytes sequences.

    When the byte after the start byte is not a continuation byte, the
    start byte alone becomes a single U+FFFD with errors='replace' and the
    second byte is decoded separately.
    E.g. in <C2 41>, C2 starts a 2-bytes sequence but 41 is the ASCII
    letter 'A', not a continuation byte.
    """
    replacement = '\ufffd'
    # (invalid second byte, text that byte decodes to on its own)
    bad_second_bytes = (
        ('00', '\x00'),
        ('7F', '\x7f'),
        ('C0', replacement),
        ('FF', replacement),
    )
    for start in ('C2', 'DF'):
        for bad, text in bad_second_bytes:
            self.assertCorrectUTF8Decoding(
                self.to_bytestring(start + ' ' + bad),
                replacement + text,
                'invalid continuation byte')
1838
def test_invalid_cb_for_3bytes_seq(self):
    """
    Check the 'invalid continuation byte' error for 3-bytes sequences.

    With errors='replace', a valid first continuation byte followed by an
    invalid byte makes the first two bytes (start byte + 1st cb) collapse
    into a single U+FFFD while the third byte is decoded separately.  If
    the first continuation byte is already invalid, only the start byte
    is replaced with U+FFFD and the following bytes are decoded
    separately.
    E.g. in <E1 80 41>, E1 starts a 3-bytes sequence and 80 is a valid
    continuation byte, but 41 is the ASCII letter 'A'.
    Note: when the start byte is E0 or ED, the valid ranges for the first
    continuation byte are limited to A0..BF and 80..9F respectively.
    Python 2 used to consider all the bytes in range 80..BF valid when the
    start byte was ED. This is fixed in Python 3.
    """
    replacement = '\ufffd'
    # (offending byte, text that byte decodes to on its own)
    ascii_bad = (('00', '\x00'), ('7F', '\x7f'))
    high_bad = (('C0', replacement), ('FF', replacement))
    # (start byte,
    #  bytes in 80..BF that are rejected as first continuation byte,
    #  lowest and highest byte accepted as first continuation byte)
    layout = (
        ('E0', ('80', '9F'), ('A0', 'BF')),
        ('E1', (), ('80', 'BF')),
        ('EC', (), ('80', 'BF')),
        ('ED', ('A0', 'BF'), ('80', '9F')),  # see the note in the docstring
        ('EE', (), ('80', 'BF')),
        ('EF', (), ('80', 'BF')),
    )
    cases = []
    for start, rejected, accepted in layout:
        rejected_bad = tuple((cb, replacement) for cb in rejected)
        # Invalid second byte: only the start byte is replaced.
        cases.extend((start + ' ' + bad, replacement + text)
                     for bad, text in ascii_bad + rejected_bad + high_bad)
        # Valid second byte, invalid third byte.
        for second in accepted:
            prefix = start + ' ' + second
            cases.extend((prefix + ' ' + bad, replacement + text)
                         for bad, text in ascii_bad + high_bad)
    for hexdump, expected in cases:
        self.assertCorrectUTF8Decoding(self.to_bytestring(hexdump),
                                       expected,
                                       'invalid continuation byte')
1896
def test_invalid_cb_for_4bytes_seq(self):
    """
    Check the 'invalid continuation byte' error for 4-bytes sequences.

    With errors='replace', the start byte and all the valid continuation
    bytes that follow it are replaced with a single U+FFFD, and all the
    bytes from the first invalid continuation byte (included) onwards are
    decoded separately.
    E.g. in <F1 80 41>, F1 starts a 4-bytes sequence and 80 is a valid
    continuation byte, but 41 is the ASCII letter 'A'.
    Note: the cases below treat 80..8F after F0 and 90..BF after F4 as
    invalid, i.e. the first continuation byte is limited to 90..BF for F0
    and to 80..8F for F4.
    """
    replacement = '\ufffd'
    # (offending byte, text that byte decodes to on its own)
    ascii_bad = (('00', '\x00'), ('7F', '\x7f'))
    high_bad = (('C0', replacement), ('FF', replacement))
    # (start byte,
    #  bytes in 80..BF that are rejected as first continuation byte,
    #  lowest and highest byte accepted as first continuation byte)
    layout = (
        ('F0', ('80', '8F'), ('90', 'BF')),
        ('F1', (), ('80', 'BF')),
        ('F3', (), ('80', 'BF')),
        ('F4', ('90', 'BF'), ('80', '8F')),
    )
    # Lowest and highest byte used as second continuation byte.
    second_cbs = ('80', 'BF')
    cases = []
    for start, rejected, accepted in layout:
        rejected_bad = tuple((cb, replacement) for cb in rejected)
        # Invalid byte right after the start byte.
        cases.extend((start + ' ' + bad, replacement + text)
                     for bad, text in ascii_bad + rejected_bad + high_bad)
        # Invalid byte after one, then after two valid continuation bytes.
        two_bytes = [start + ' ' + first for first in accepted]
        three_bytes = [prefix + ' ' + second
                       for prefix in two_bytes for second in second_cbs]
        for prefix in two_bytes + three_bytes:
            cases.extend((prefix + ' ' + bad, replacement + text)
                         for bad, text in ascii_bad + high_bad)
    for hexdump, expected in cases:
        self.assertCorrectUTF8Decoding(self.to_bytestring(hexdump),
                                       expected,
                                       'invalid continuation byte')
1975
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001976 def test_codecs_idna(self):
1977 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001978 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001979
def test_codecs_errors(self):
    """Check error handling when encoding, decoding and converting."""
    text = 'Andr\202 x'
    raw = b'Andr\202 x'

    # Error handling (encoding)
    with self.assertRaises(UnicodeError):
        text.encode('ascii')
    with self.assertRaises(UnicodeError):
        text.encode('ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), b"Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), b"Andr? x")
    # Positional and keyword spellings must agree.
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    with self.assertRaises(UnicodeError):
        str(raw, 'ascii')
    with self.assertRaises(UnicodeError):
        str(raw, 'ascii', 'strict')
    self.assertEqual(str(raw, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(raw, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    decoded = b"\\N{foo}xx".decode("unicode-escape", "ignore")
    self.assertEqual(decoded, "xx")

    # Error handling (truncated escape sequence)
    with self.assertRaises(UnicodeError):
        b"\\".decode("unicode-escape")

    # test.unicode1 / test.unicode2 come from search_function() at the top
    # of this file; their encoders and decoders return malformed results.
    with self.assertRaises(TypeError):
        b"hello".decode("test.unicode1")
    with self.assertRaises(TypeError):
        str(b"hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        with self.assertRaises(TypeError):
            "hello".encode(broken_codec)

    # Error handling (wrong arguments)
    with self.assertRaises(TypeError):
        "hello".encode(42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (float, complex):
        for lone_surrogate in ("\ud800", "\udf00"):
            with self.assertRaises(UnicodeError):
                convert(lone_surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00002016
Walter Dörwald28256f22003-01-19 16:59:20 +00002017 def test_codecs(self):
2018 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002019 self.assertEqual('hello'.encode('ascii'), b'hello')
2020 self.assertEqual('hello'.encode('utf-7'), b'hello')
2021 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002022 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002023 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2024 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2025 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002026
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002027 # Default encoding is utf-8
2028 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2029
Walter Dörwald28256f22003-01-19 16:59:20 +00002030 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002031 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002032 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002033 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2034 'utf-16-be', 'raw_unicode_escape',
2035 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002036 with warnings.catch_warnings():
2037 # unicode-internal has been deprecated
2038 warnings.simplefilter("ignore", DeprecationWarning)
2039
2040 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002041
Walter Dörwald28256f22003-01-19 16:59:20 +00002042 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002043 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002044 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002045 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002046 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002047
Walter Dörwald28256f22003-01-19 16:59:20 +00002048 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002049 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002050 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002051 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002052 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002053
Walter Dörwald28256f22003-01-19 16:59:20 +00002054 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002055 with warnings.catch_warnings():
2056 # unicode-internal has been deprecated
2057 warnings.simplefilter("ignore", DeprecationWarning)
2058
2059 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2060 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2061 'raw_unicode_escape',
2062 'unicode_escape', 'unicode_internal'):
2063 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002064
Antoine Pitrou51f66482011-11-11 13:35:44 +01002065 # UTF-8 must be roundtrip safe for all code points
2066 # (except surrogates, which are forbidden).
2067 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002068 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002069 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002070 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002071
Walter Dörwald28256f22003-01-19 16:59:20 +00002072 def test_codecs_charmap(self):
2073 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002074 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002075 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002076 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002077 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2078 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002079 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002080 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2081 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2082 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
2083 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002084
Walter Dörwald28256f22003-01-19 16:59:20 +00002085 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2086 'cp1256', 'cp1257', 'cp1258',
2087 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002088
Walter Dörwald28256f22003-01-19 16:59:20 +00002089 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2090 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002091
Walter Dörwald28256f22003-01-19 16:59:20 +00002092 ### These have undefined mappings:
2093 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002094
Walter Dörwald28256f22003-01-19 16:59:20 +00002095 ### These fail the round-trip:
2096 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002097
Walter Dörwald28256f22003-01-19 16:59:20 +00002098 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002099 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002100
Walter Dörwald28256f22003-01-19 16:59:20 +00002101 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002102 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002103 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002104 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002105 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2106 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002107 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002108 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2109 'iso8859_2', 'iso8859_4', 'iso8859_5',
2110 'iso8859_9', 'koi8_r', 'latin_1',
2111 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002112
Walter Dörwald28256f22003-01-19 16:59:20 +00002113 ### These have undefined mappings:
2114 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2115 #'cp1256', 'cp1257', 'cp1258',
2116 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2117 #'iso8859_3', 'iso8859_6', 'iso8859_7',
2118 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002119
Walter Dörwald28256f22003-01-19 16:59:20 +00002120 ### These fail the round-trip:
2121 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002122
Walter Dörwald28256f22003-01-19 16:59:20 +00002123 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002124 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002125
Walter Dörwald28256f22003-01-19 16:59:20 +00002126 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002127 self.assertEqual(("abc" "def"), "abcdef")
2128 self.assertEqual(("abc" "def"), "abcdef")
2129 self.assertEqual(("abc" "def"), "abcdef")
2130 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2131 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002132
Walter Dörwald28256f22003-01-19 16:59:20 +00002133 def test_printing(self):
2134 class BitBucket:
2135 def write(self, text):
2136 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002137
Walter Dörwald28256f22003-01-19 16:59:20 +00002138 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002139 print('abc', file=out)
2140 print('abc', 'def', file=out)
2141 print('abc', 'def', file=out)
2142 print('abc', 'def', file=out)
2143 print('abc\n', file=out)
2144 print('abc\n', end=' ', file=out)
2145 print('abc\n', end=' ', file=out)
2146 print('def\n', file=out)
2147 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002148
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002149 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002150 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002151 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2152 self.assertEqual(x, y)
2153
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002154 y = br'\U00100000'
2155 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2156 self.assertEqual(x, y)
2157 y = br'\U00010000'
2158 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2159 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002160
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002161 try:
2162 br'\U11111111'.decode("raw-unicode-escape")
2163 except UnicodeDecodeError as e:
2164 self.assertEqual(e.start, 0)
2165 self.assertEqual(e.end, 10)
2166 else:
2167 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002168
Brett Cannonc3647ac2005-04-26 03:45:26 +00002169 def test_conversion(self):
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002170 # Make sure __str__() works properly
2171 class ObjectToStr:
Brett Cannonc3647ac2005-04-26 03:45:26 +00002172 def __str__(self):
2173 return "foo"
2174
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002175 class StrSubclassToStr(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002176 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002177 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002178
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002179 class StrSubclassToStrSubclass(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002180 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002181 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002182 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002183 return self
2184
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002185 self.assertEqual(str(ObjectToStr()), "foo")
2186 self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2187 s = str(StrSubclassToStrSubclass("foo"))
2188 self.assertEqual(s, "foofoo")
2189 self.assertIs(type(s), StrSubclassToStrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002190
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002191 def test_unicode_repr(self):
2192 class s1:
2193 def __repr__(self):
2194 return '\\n'
2195
2196 class s2:
2197 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002198 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002199
2200 self.assertEqual(repr(s1()), '\\n')
2201 self.assertEqual(repr(s2()), '\\n')
2202
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002203 def test_printable_repr(self):
2204 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002205 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002206
Zachary Ware9fe6d862013-12-08 00:20:35 -06002207 # This test only affects 32-bit platforms because expandtabs can only take
2208 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2209 # to take a 64-bit long, this test should apply to all platforms.
2210 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2211 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002212 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002213 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002214
Victor Stinner1d972ad2011-10-07 13:31:46 +02002215 @support.cpython_only
Antoine Pitroue19aa382011-10-04 16:04:01 +02002216 def test_expandtabs_optimization(self):
2217 s = 'abc'
2218 self.assertIs(s.expandtabs(), s)
2219
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002220 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if struct.calcsize('P') == 8:
2222 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002223 ascii_struct_size = 48
2224 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 else:
2226 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002227 ascii_struct_size = 24
2228 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229
2230 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2231 code = ord(char)
2232 if code < 0x100:
2233 char_size = 1 # sizeof(Py_UCS1)
2234 struct_size = ascii_struct_size
2235 elif code < 0x10000:
2236 char_size = 2 # sizeof(Py_UCS2)
2237 struct_size = compact_struct_size
2238 else:
2239 char_size = 4 # sizeof(Py_UCS4)
2240 struct_size = compact_struct_size
2241 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002242 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2243 # be allocatable, given enough memory.
2244 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 alloc = lambda: char * maxlen
2246 self.assertRaises(MemoryError, alloc)
2247 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002248
Victor Stinner808fc0a2010-03-22 12:50:40 +00002249 def test_format_subclass(self):
2250 class S(str):
2251 def __str__(self):
2252 return '__str__ overridden'
2253 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002254 self.assertEqual("%s" % s, '__str__ overridden')
2255 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002256
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002257 # Test PyUnicode_FromFormat()
Victor Stinner1205f272010-09-11 00:54:47 +00002258 def test_from_format(self):
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002259 support.import_module('ctypes')
Victor Stinner15a11362012-10-06 23:48:20 +02002260 from ctypes import (
2261 pythonapi, py_object, sizeof,
Victor Stinner6d970f42011-03-02 00:04:25 +00002262 c_int, c_long, c_longlong, c_ssize_t,
Victor Stinner15a11362012-10-06 23:48:20 +02002263 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 name = "PyUnicode_FromFormat"
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002265 _PyUnicode_FromFormat = getattr(pythonapi, name)
2266 _PyUnicode_FromFormat.restype = py_object
2267
2268 def PyUnicode_FromFormat(format, *args):
2269 cargs = tuple(
2270 py_object(arg) if isinstance(arg, str) else arg
2271 for arg in args)
2272 return _PyUnicode_FromFormat(format, *cargs)
Victor Stinner1205f272010-09-11 00:54:47 +00002273
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002274 def check_format(expected, format, *args):
2275 text = PyUnicode_FromFormat(format, *args)
2276 self.assertEqual(expected, text)
2277
Victor Stinner1205f272010-09-11 00:54:47 +00002278 # ascii format, non-ascii argument
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002279 check_format('ascii\x7f=unicode\xe9',
2280 b'ascii\x7f=%U', 'unicode\xe9')
Victor Stinner1205f272010-09-11 00:54:47 +00002281
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002282 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2283 # raises an error
Ezio Melottied3a7d22010-12-01 02:32:32 +00002284 self.assertRaisesRegex(ValueError,
Victor Stinner1205f272010-09-11 00:54:47 +00002285 '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
Victor Stinner4c7db312010-09-12 07:51:18 +00002286 'string, got a non-ASCII byte: 0xe9$',
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002287 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002288
Victor Stinner96865452011-03-01 23:44:09 +00002289 # test "%c"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002290 check_format('\uabcd',
2291 b'%c', c_int(0xabcd))
2292 check_format('\U0010ffff',
2293 b'%c', c_int(0x10ffff))
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002294 with self.assertRaises(OverflowError):
2295 PyUnicode_FromFormat(b'%c', c_int(0x110000))
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +03002296 # Issue #18183
Serhiy Storchakaf15ffe02013-06-12 09:28:20 +03002297 check_format('\U00010000\U00100000',
2298 b'%c%c', c_int(0x10000), c_int(0x100000))
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002299
Victor Stinner96865452011-03-01 23:44:09 +00002300 # test "%"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002301 check_format('%',
2302 b'%')
2303 check_format('%',
2304 b'%%')
2305 check_format('%s',
2306 b'%%s')
2307 check_format('[%]',
2308 b'[%%]')
2309 check_format('%abc',
2310 b'%%%s', b'abc')
2311
2312 # truncated string
2313 check_format('abc',
2314 b'%.3s', b'abcdef')
2315 check_format('abc[\ufffd',
2316 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2317 check_format("'\\u20acABC'",
2318 b'%A', '\u20acABC')
2319 check_format("'\\u20",
2320 b'%.5A', '\u20acABCDEF')
2321 check_format("'\u20acABC'",
2322 b'%R', '\u20acABC')
2323 check_format("'\u20acA",
2324 b'%.3R', '\u20acABCDEF')
2325 check_format('\u20acAB',
2326 b'%.3S', '\u20acABCDEF')
2327 check_format('\u20acAB',
2328 b'%.3U', '\u20acABCDEF')
2329 check_format('\u20acAB',
2330 b'%.3V', '\u20acABCDEF', None)
2331 check_format('abc[\ufffd',
2332 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2333
2334 # following tests comes from #7330
2335 # test width modifier and precision modifier with %S
2336 check_format("repr= abc",
2337 b'repr=%5S', 'abc')
2338 check_format("repr=ab",
2339 b'repr=%.2S', 'abc')
2340 check_format("repr= ab",
2341 b'repr=%5.2S', 'abc')
2342
2343 # test width modifier and precision modifier with %R
2344 check_format("repr= 'abc'",
2345 b'repr=%8R', 'abc')
2346 check_format("repr='ab",
2347 b'repr=%.3R', 'abc')
2348 check_format("repr= 'ab",
2349 b'repr=%5.3R', 'abc')
2350
2351 # test width modifier and precision modifier with %A
2352 check_format("repr= 'abc'",
2353 b'repr=%8A', 'abc')
2354 check_format("repr='ab",
2355 b'repr=%.3A', 'abc')
2356 check_format("repr= 'ab",
2357 b'repr=%5.3A', 'abc')
2358
2359 # test width modifier and precision modifier with %s
2360 check_format("repr= abc",
2361 b'repr=%5s', b'abc')
2362 check_format("repr=ab",
2363 b'repr=%.2s', b'abc')
2364 check_format("repr= ab",
2365 b'repr=%5.2s', b'abc')
2366
2367 # test width modifier and precision modifier with %U
2368 check_format("repr= abc",
2369 b'repr=%5U', 'abc')
2370 check_format("repr=ab",
2371 b'repr=%.2U', 'abc')
2372 check_format("repr= ab",
2373 b'repr=%5.2U', 'abc')
2374
2375 # test width modifier and precision modifier with %V
2376 check_format("repr= abc",
2377 b'repr=%5V', 'abc', b'123')
2378 check_format("repr=ab",
2379 b'repr=%.2V', 'abc', b'123')
2380 check_format("repr= ab",
2381 b'repr=%5.2V', 'abc', b'123')
2382 check_format("repr= 123",
2383 b'repr=%5V', None, b'123')
2384 check_format("repr=12",
2385 b'repr=%.2V', None, b'123')
2386 check_format("repr= 12",
2387 b'repr=%5.2V', None, b'123')
Victor Stinner96865452011-03-01 23:44:09 +00002388
Victor Stinner6d970f42011-03-02 00:04:25 +00002389 # test integer formats (%i, %d, %u)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002390 check_format('010',
2391 b'%03i', c_int(10))
2392 check_format('0010',
2393 b'%0.4i', c_int(10))
2394 check_format('-123',
2395 b'%i', c_int(-123))
2396 check_format('-123',
2397 b'%li', c_long(-123))
2398 check_format('-123',
2399 b'%lli', c_longlong(-123))
2400 check_format('-123',
2401 b'%zi', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002402
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002403 check_format('-123',
2404 b'%d', c_int(-123))
2405 check_format('-123',
2406 b'%ld', c_long(-123))
2407 check_format('-123',
2408 b'%lld', c_longlong(-123))
2409 check_format('-123',
2410 b'%zd', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002411
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002412 check_format('123',
2413 b'%u', c_uint(123))
2414 check_format('123',
2415 b'%lu', c_ulong(123))
2416 check_format('123',
2417 b'%llu', c_ulonglong(123))
2418 check_format('123',
2419 b'%zu', c_size_t(123))
Victor Stinner6d970f42011-03-02 00:04:25 +00002420
Victor Stinner15a11362012-10-06 23:48:20 +02002421 # test long output
2422 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2423 max_longlong = -min_longlong - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002424 check_format(str(min_longlong),
2425 b'%lld', c_longlong(min_longlong))
2426 check_format(str(max_longlong),
2427 b'%lld', c_longlong(max_longlong))
Victor Stinner15a11362012-10-06 23:48:20 +02002428 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002429 check_format(str(max_ulonglong),
2430 b'%llu', c_ulonglong(max_ulonglong))
Victor Stinner15a11362012-10-06 23:48:20 +02002431 PyUnicode_FromFormat(b'%p', c_void_p(-1))
2432
Victor Stinnere215d962012-10-06 23:03:36 +02002433 # test padding (width and/or precision)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 check_format('123'.rjust(10, '0'),
2435 b'%010i', c_int(123))
2436 check_format('123'.rjust(100),
2437 b'%100i', c_int(123))
2438 check_format('123'.rjust(100, '0'),
2439 b'%.100i', c_int(123))
2440 check_format('123'.rjust(80, '0').rjust(100),
2441 b'%100.80i', c_int(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002442
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002443 check_format('123'.rjust(10, '0'),
2444 b'%010u', c_uint(123))
2445 check_format('123'.rjust(100),
2446 b'%100u', c_uint(123))
2447 check_format('123'.rjust(100, '0'),
2448 b'%.100u', c_uint(123))
2449 check_format('123'.rjust(80, '0').rjust(100),
2450 b'%100.80u', c_uint(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002451
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002452 check_format('123'.rjust(10, '0'),
2453 b'%010x', c_int(0x123))
2454 check_format('123'.rjust(100),
2455 b'%100x', c_int(0x123))
2456 check_format('123'.rjust(100, '0'),
2457 b'%.100x', c_int(0x123))
2458 check_format('123'.rjust(80, '0').rjust(100),
2459 b'%100.80x', c_int(0x123))
Victor Stinnere215d962012-10-06 23:03:36 +02002460
Victor Stinner6d970f42011-03-02 00:04:25 +00002461 # test %A
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002462 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2463 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
Victor Stinner9a909002010-10-18 20:59:24 +00002464
Victor Stinner6d970f42011-03-02 00:04:25 +00002465 # test %V
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002466 check_format('repr=abc',
2467 b'repr=%V', 'abc', b'xyz')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468
2469 # Test string decode from parameter of %s using utf-8.
2470 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2471 # '\u4eba\u6c11'
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002472 check_format('repr=\u4eba\u6c11',
2473 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002474
2475 #Test replace error handler.
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002476 check_format('repr=abc\ufffd',
2477 b'repr=%V', None, b'abc\xff')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478
Victor Stinner6d970f42011-03-02 00:04:25 +00002479 # not supported: copy the raw format string. these tests are just here
2480 # to check for crashs and should not be considered as specifications
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002481 check_format('%s',
2482 b'%1%s', b'abc')
2483 check_format('%1abc',
2484 b'%1abc')
2485 check_format('%+i',
2486 b'%+i', c_int(10))
2487 check_format('%.%s',
2488 b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002489
Victor Stinner1c24bd02010-10-02 11:03:13 +00002490 # Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    # Checks the _testcapi.unicode_aswidechar() wrapper: for each input
    # string and buffer length, verify the reported size and the buffer
    # content that comes back.
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # A non-BMP character needs two wchar_t units when wchar_t is 16-bit
    # and a single unit otherwise; the buffer leaves one extra slot.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        nonbmp_buflen, nonbmp_size = 3, 2
    else:  # sizeof(c_wchar) == 4
        nonbmp_buflen, nonbmp_size = 2, 1

    # (text, buffer length, expected size, expected buffer content)
    cases = (
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
        (nonbmp, nonbmp_buflen, nonbmp_size, nonbmp + '\0'),
    )
    for text, buflen, expected_size, expected_content in cases:
        content, reported = unicode_aswidechar(text, buflen)
        self.assertEqual(reported, expected_size)
        self.assertEqual(content, expected_content)
Victor Stinner5593d8a2010-10-02 11:11:27 +00002527
Victor Stinner1c24bd02010-10-02 11:03:13 +00002528 # Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    # Checks the _testcapi.unicode_aswidecharstring() wrapper: the returned
    # buffer must be the input followed by a NUL, and the size must match
    # the expected number of wchar_t units.
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    nonbmp = chr(0x10ffff)
    # Two units for a non-BMP character with a 16-bit wchar_t, one otherwise.
    nonbmp_size = 2 if sizeof(c_wchar) == 2 else 1

    cases = (
        ('abc', 3),
        ('abc\0def', 7),
        (nonbmp, nonbmp_size),
    )
    for text, expected_size in cases:
        content, reported = unicode_aswidecharstring(text)
        self.assertEqual(reported, expected_size)
        self.assertEqual(content, text + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002551
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002552 def test_subclass_add(self):
2553 class S(str):
2554 def __add__(self, o):
2555 return "3"
2556 self.assertEqual(S("4") + S("5"), "3")
2557 class S(str):
2558 def __iadd__(self, o):
2559 return "3"
2560 s = S("1")
2561 s += "4"
2562 self.assertEqual(s, "3")
2563
@support.cpython_only
def test_encode_decimal(self):
    # Checks the _testcapi.unicode_encodedecimal() wrapper on plain digits,
    # non-ASCII decimal digits and Unicode spaces, then on the error paths.
    from _testcapi import unicode_encodedecimal as encode_decimal

    conversions = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, expected in conversions:
        self.assertEqual(encode_decimal(text), expected)

    # A character that cannot be encoded fails under "strict" ...
    with self.assertRaises(UnicodeEncodeError):
        encode_decimal("123\u20ac", "strict")
    # ... and is reported as a ValueError under "replace".
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        encode_decimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002579
@support.cpython_only
def test_transform_decimal(self):
    # Checks the _testcapi.unicode_transformdecimaltoascii() wrapper:
    # decimal digits become ASCII digits, everything else is unchanged.
    from _testcapi import unicode_transformdecimaltoascii as to_ascii_digits

    conversions = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, expected in conversions:
        self.assertEqual(to_ascii_digits(text), expected)
2591
Victor Stinnerc814a382011-11-22 01:06:15 +01002592 def test_getnewargs(self):
2593 text = 'abc'
2594 args = text.__getnewargs__()
2595 self.assertIsNot(args[0], text)
2596 self.assertEqual(args[0], text)
2597 self.assertEqual(len(args), 1)
2598
def test_resize(self):
    # Encode a string, grow it in place, and encode it again: the second
    # encoding must reflect the resized string.
    codec = 'unicode_internal'
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1)
        text = 'a' * length + 'b'

        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # fill wstr internal field
            before = text.encode(codec)
            self.assertEqual(before.decode(codec), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode(codec)
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode(codec), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002615
Victor Stinner9fc59812013-04-08 22:34:43 +02002616 def test_compare(self):
2617 # Issue #17615
2618 N = 10
2619 ascii = 'a' * N
2620 ascii2 = 'z' * N
2621 latin = '\x80' * N
2622 latin2 = '\xff' * N
2623 bmp = '\u0100' * N
2624 bmp2 = '\uffff' * N
2625 astral = '\U00100000' * N
2626 astral2 = '\U0010ffff' * N
2627 strings = (
2628 ascii, ascii2,
2629 latin, latin2,
2630 bmp, bmp2,
2631 astral, astral2)
2632 for text1, text2 in itertools.combinations(strings, 2):
2633 equal = (text1 is text2)
2634 self.assertEqual(text1 == text2, equal)
2635 self.assertEqual(text1 != text2, not equal)
2636
2637 if equal:
2638 self.assertTrue(text1 <= text2)
2639 self.assertTrue(text1 >= text2)
2640
2641 # text1 is text2: duplicate strings to skip the "str1 == str2"
2642 # optimization in unicode_compare_eq() and really compare
2643 # character per character
2644 copy1 = duplicate_string(text1)
2645 copy2 = duplicate_string(text2)
2646 self.assertIsNot(copy1, copy2)
2647
2648 self.assertTrue(copy1 == copy2)
2649 self.assertFalse(copy1 != copy2)
2650
2651 self.assertTrue(copy1 <= copy2)
2652 self.assertTrue(copy2 >= copy2)
2653
2654 self.assertTrue(ascii < ascii2)
2655 self.assertTrue(ascii < latin)
2656 self.assertTrue(ascii < bmp)
2657 self.assertTrue(ascii < astral)
2658 self.assertFalse(ascii >= ascii2)
2659 self.assertFalse(ascii >= latin)
2660 self.assertFalse(ascii >= bmp)
2661 self.assertFalse(ascii >= astral)
2662
2663 self.assertFalse(latin < ascii)
2664 self.assertTrue(latin < latin2)
2665 self.assertTrue(latin < bmp)
2666 self.assertTrue(latin < astral)
2667 self.assertTrue(latin >= ascii)
2668 self.assertFalse(latin >= latin2)
2669 self.assertFalse(latin >= bmp)
2670 self.assertFalse(latin >= astral)
2671
2672 self.assertFalse(bmp < ascii)
2673 self.assertFalse(bmp < latin)
2674 self.assertTrue(bmp < bmp2)
2675 self.assertTrue(bmp < astral)
2676 self.assertTrue(bmp >= ascii)
2677 self.assertTrue(bmp >= latin)
2678 self.assertFalse(bmp >= bmp2)
2679 self.assertFalse(bmp >= astral)
2680
2681 self.assertFalse(astral < ascii)
2682 self.assertFalse(astral < latin)
2683 self.assertFalse(astral < bmp2)
2684 self.assertTrue(astral < astral2)
2685 self.assertTrue(astral >= ascii)
2686 self.assertTrue(astral >= latin)
2687 self.assertTrue(astral >= bmp2)
2688 self.assertFalse(astral >= astral2)
2689
Victor Stinner1c24bd02010-10-02 11:03:13 +00002690
class StringModuleTest(unittest.TestCase):
    # Tests for _string.formatter_parser() and
    # _string.formatter_field_name_split().

    def test_formatter_parser(self):
        # Each entry pairs a format string with the list of 4-tuples that
        # _string.formatter_parser() is expected to yield for it.
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        def split(name):
            # Materialize the lazily produced second item so the result
            # can be compared against plain lists.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for name, expected in expectations:
            self.assertEqual(split(name), expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2740
2741
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()