blob: 2773fe5373e735314c900c0fb33f638129e0f4b6 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken test codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns the bare
    integer 42 instead of a tuple; "test.unicode2" maps to a pair that
    returns the tuple (42, 42).  Any other encoding name yields None.
    """
    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
35
def duplicate_string(text):
    """
    Return a copy of *text* produced by an encode/decode round trip.

    The intent is a fresh object with a reference count of 1.  This is
    best-effort only: latin1 single letters and the empty string ('')
    are singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
45
Brett Cannon226b2302010-03-20 22:22:22 +000046class UnicodeTest(string_tests.CommonTest,
47 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020048 string_tests.MixinStrUnicodeTest,
49 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000050
Guido van Rossumef87d6e2007-05-02 19:09:54 +000051 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000052
53 def checkequalnofix(self, result, object, methodname, *args):
54 method = getattr(object, methodname)
55 realresult = method(*args)
56 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000057 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000058
59 # if the original is returned make sure that
60 # this doesn't happen with subclasses
61 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000062 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000063 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000064 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000065 object = usub(object)
66 method = getattr(object, methodname)
67 realresult = method(*args)
68 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000069 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000070
Jeremy Hylton504de6b2003-10-06 05:08:26 +000071 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000072 self.assertEqual('\xff', '\u00ff')
73 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000074 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
75 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
76 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000077 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000078 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000079
    def test_ascii(self):
        # Checks ascii() on str objects (quoting, backslash escapes, every
        # code point below 256) and on an object whose __repr__ returns bytes.
        # NOTE(review): the nesting below is reconstructed; the flattened
        # source does not show where the body of this `if` ends -- confirm.
        if not sys.platform.startswith('java'):
            # Test basic sanity of ascii()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            # NOTE(review): identical to the assertion just above -- looks
            # redundant; confirm before removing.
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            # Expected ascii() of chr(0)..chr(255) joined together: every
            # character from \x80 upwards appears as a \xNN escape.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            # Both sides are the same expression, so this only verifies that
            # the calls complete and agree with each other.
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make ascii() raise TypeError.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())
121
    def test_repr(self):
        # Checks repr() on str objects (quoting, backslash escapes, every
        # code point below 256) and on an object whose __repr__ returns bytes.
        # NOTE(review): the nesting below is reconstructed; the flattened
        # source does not show where the body of this `if` ends -- confirm.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            self.assertEqual(repr("'\""), """'\\'"'""")
            # NOTE(review): identical to the assertion just above -- looks
            # redundant; confirm before removing.
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
            # Expected repr() of chr(0)..chr(255) joined together.  Unlike the
            # ascii() expectation, characters from \xa1 upwards are written
            # here as the characters themselves (single backslash in this
            # source); only \xa0 and \xad are expected as escapes.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
            testrepr = repr(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test repr works on wide unicode escapes without overflow.
            # Both sides are the same expression, so this only verifies that
            # the calls complete and agree with each other.
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make repr() raise TypeError.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())
163
Guido van Rossum49d6b072006-08-17 21:11:47 +0000164 def test_iterators(self):
165 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000166 it = "\u1111\u2222\u3333".__iter__()
167 self.assertEqual(next(it), "\u1111")
168 self.assertEqual(next(it), "\u2222")
169 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000170 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000171
Walter Dörwald28256f22003-01-19 16:59:20 +0000172 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000173 string_tests.CommonTest.test_count(self)
174 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000175 self.checkequalnofix(3, 'aaa', 'count', 'a')
176 self.checkequalnofix(0, 'aaa', 'count', 'b')
177 self.checkequalnofix(3, 'aaa', 'count', 'a')
178 self.checkequalnofix(0, 'aaa', 'count', 'b')
179 self.checkequalnofix(0, 'aaa', 'count', 'b')
180 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
181 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
182 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
183 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200184 # test mixed kinds
185 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
186 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
187 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
188 self.checkequal(0, 'a' * 10, 'count', '\u0102')
189 self.checkequal(0, 'a' * 10, 'count', '\U00100304')
190 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
191 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
192 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
193 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
194 self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
195 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
196 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
Walter Dörwald28256f22003-01-19 16:59:20 +0000198 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200199 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200200 # test implementation details of the memchr fast path
201 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
202 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
203 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
204 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
205 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
206 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
207 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
208 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000209 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
210 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
211 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000212
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000213 self.assertRaises(TypeError, 'hello'.find)
214 self.assertRaises(TypeError, 'hello'.find, 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200215 # test mixed kinds
216 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
217 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
218 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
219 self.checkequal(-1, 'a' * 100, 'find', '\u0102')
220 self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
221 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
222 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
223 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
224 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
225 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
226 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
227 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000228
Walter Dörwald28256f22003-01-19 16:59:20 +0000229 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000230 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200231 # test implementation details of the memrchr fast path
232 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
233 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
234 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
235 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
236 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
237 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
238 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000239 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000240 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
241 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
242 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200243 # test mixed kinds
244 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
245 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
246 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
247 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
248 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
249 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
250 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
251 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
252 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
253 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
254 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
255 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
Guido van Rossum8b264542000-12-19 02:22:31 +0000256
Walter Dörwald28256f22003-01-19 16:59:20 +0000257 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000258 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000259 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
260 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
261 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
262 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
263 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
264 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
265 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
266 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200267 # test mixed kinds
268 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
269 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
270 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
271 self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
272 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
273 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
274 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
275 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
276 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
277 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
278 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
279 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000280
Walter Dörwald28256f22003-01-19 16:59:20 +0000281 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000282 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000283 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
284 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
285 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
286 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000287
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000288 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
289 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
290 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
291 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
292 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200293 # test mixed kinds
294 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
295 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
296 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
297 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
298 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
299 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
300 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
301 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
302 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
303 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
304 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
305 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000306
Georg Brandlceee0772007-11-27 23:48:05 +0000307 def test_maketrans_translate(self):
308 # these work with plain translate()
309 self.checkequalnofix('bbbc', 'abababc', 'translate',
310 {ord('a'): None})
311 self.checkequalnofix('iiic', 'abababc', 'translate',
312 {ord('a'): None, ord('b'): ord('i')})
313 self.checkequalnofix('iiix', 'abababc', 'translate',
314 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
315 self.checkequalnofix('c', 'abababc', 'translate',
316 {ord('a'): None, ord('b'): ''})
317 self.checkequalnofix('xyyx', 'xzx', 'translate',
318 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200319
Georg Brandlceee0772007-11-27 23:48:05 +0000320 # this needs maketrans()
321 self.checkequalnofix('abababc', 'abababc', 'translate',
322 {'b': '<i>'})
323 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
324 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
325 # test alternative way of calling maketrans()
326 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
327 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
328
Victor Stinner5a29f252014-04-05 00:17:51 +0200329 # various tests switching from ASCII to latin1 or the opposite;
330 # same length, remove a letter, or replace with a longer string.
331 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
332 "[X]")
333 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
334 "[X]")
335 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
336 "[]")
337 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
338 "[XXX]")
339 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
340 "[\xe9]")
341 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
342 "[<\xe9>]")
343 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
344 "[a]")
345 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
346 "[]")
347
Victor Stinner4ff33af2014-04-05 11:56:37 +0200348 # invalid Unicode characters
349 invalid_char = 0x10ffff+1
350 for before in "a\xe9\u20ac\U0010ffff":
351 mapping = str.maketrans({before: invalid_char})
352 text = "[%s]" % before
353 self.assertRaises(ValueError, text.translate, mapping)
354
355 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000356 self.assertRaises(TypeError, self.type2test.maketrans)
357 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
358 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
359 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
360 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
361 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
362 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000363
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000364 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000365 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000366
Walter Dörwald28256f22003-01-19 16:59:20 +0000367 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000368 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000369
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000370 # Mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000371 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
372 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
373 self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200374 # test mixed kinds
375 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
376 left *= 9
377 right *= 9
378 for delim in ('c', '\u0102', '\U00010302'):
379 self.checkequal([left + right],
380 left + right, 'split', delim)
381 self.checkequal([left, right],
382 left + delim + right, 'split', delim)
383 self.checkequal([left + right],
384 left + right, 'split', delim * 2)
385 self.checkequal([left, right],
386 left + delim * 2 + right, 'split', delim *2)
387
388 def test_rsplit(self):
389 string_tests.CommonTest.test_rsplit(self)
390 # test mixed kinds
391 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
392 left *= 9
393 right *= 9
394 for delim in ('c', '\u0102', '\U00010302'):
395 self.checkequal([left + right],
396 left + right, 'rsplit', delim)
397 self.checkequal([left, right],
398 left + delim + right, 'rsplit', delim)
399 self.checkequal([left + right],
400 left + right, 'rsplit', delim * 2)
401 self.checkequal([left, right],
402 left + delim * 2 + right, 'rsplit', delim *2)
403
404 def test_partition(self):
405 string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
406 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300407 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200408 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
409 left *= 9
410 right *= 9
411 for delim in ('c', '\u0102', '\U00010302'):
412 self.checkequal((left + right, '', ''),
413 left + right, 'partition', delim)
414 self.checkequal((left, delim, right),
415 left + delim + right, 'partition', delim)
416 self.checkequal((left + right, '', ''),
417 left + right, 'partition', delim * 2)
418 self.checkequal((left, delim * 2, right),
419 left + delim * 2 + right, 'partition', delim * 2)
420
421 def test_rpartition(self):
422 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
423 # test mixed kinds
Serhiy Storchaka48070c12015-03-29 19:21:02 +0300424 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200425 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
426 left *= 9
427 right *= 9
428 for delim in ('c', '\u0102', '\U00010302'):
429 self.checkequal(('', '', left + right),
430 left + right, 'rpartition', delim)
431 self.checkequal((left, delim, right),
432 left + delim + right, 'rpartition', delim)
433 self.checkequal(('', '', left + right),
434 left + right, 'rpartition', delim * 2)
435 self.checkequal((left, delim * 2, right),
436 left + delim * 2 + right, 'rpartition', delim * 2)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000437
Walter Dörwald28256f22003-01-19 16:59:20 +0000438 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000439 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000440
Guido van Rossumf1044292007-09-27 18:01:22 +0000441 class MyWrapper:
442 def __init__(self, sval): self.sval = sval
443 def __str__(self): return self.sval
444
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000445 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000446 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
447 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
448 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
449 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
450 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
451 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
452 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000453 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
454 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
455 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
456 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000457
Walter Dörwald28256f22003-01-19 16:59:20 +0000458 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000459 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000460
Walter Dörwald28256f22003-01-19 16:59:20 +0000461 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000462 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
463 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200464 # test mixed kinds
465 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
466 left *= 9
467 right *= 9
468 for delim in ('c', '\u0102', '\U00010302'):
469 for repl in ('d', '\u0103', '\U00010303'):
470 self.checkequal(left + right,
471 left + right, 'replace', delim, repl)
472 self.checkequal(left + repl + right,
473 left + delim + right,
474 'replace', delim, repl)
475 self.checkequal(left + right,
476 left + right, 'replace', delim * 2, repl)
477 self.checkequal(left + repl + right,
478 left + delim * 2 + right,
479 'replace', delim * 2, repl)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000480
Victor Stinner59de0ee2011-10-07 10:01:28 +0200481 @support.cpython_only
482 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200483 pattern = 'abc'
484 text = 'abc def'
485 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200486
Guido van Rossum98297ee2007-11-06 21:34:58 +0000487 def test_bytes_comparison(self):
Brett Cannon226b2302010-03-20 22:22:22 +0000488 with support.check_warnings():
489 warnings.simplefilter('ignore', BytesWarning)
490 self.assertEqual('abc' == b'abc', False)
491 self.assertEqual('abc' != b'abc', True)
492 self.assertEqual('abc' == bytearray(b'abc'), False)
493 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000494
    def test_comparison(self):
        # Only the three checks directly below run; everything under
        # `if 0:` is disabled.
        # Comparisons:
        self.assertEqual('abc', 'abc')
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')

        # NOTE(review): the extent of this disabled block is reconstructed;
        # the flattened source does not show where the `if 0:` body ends --
        # confirm.
        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue('\u0061' < '\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue('\u0061' < '\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            # Compares `s` against a fixed series of surrogate pairs; each
            # comparison asserts s < pair.
            def test_fixup(s):
                s2 = '\ud800\udc01'
                test_lecmp(s, s2)
                s2 = '\ud900\udc01'
                test_lecmp(s, s2)
                s2 = '\uda00\udc01'
                test_lecmp(s, s2)
                s2 = '\udb00\udc01'
                test_lecmp(s, s2)
                s2 = '\ud800\udd01'
                test_lecmp(s, s2)
                s2 = '\ud900\udd01'
                test_lecmp(s, s2)
                s2 = '\uda00\udd01'
                test_lecmp(s, s2)
                s2 = '\udb00\udd01'
                test_lecmp(s, s2)
                s2 = '\ud800\ude01'
                test_lecmp(s, s2)
                s2 = '\ud900\ude01'
                test_lecmp(s, s2)
                s2 = '\uda00\ude01'
                test_lecmp(s, s2)
                s2 = '\udb00\ude01'
                test_lecmp(s, s2)
                s2 = '\ud800\udfff'
                test_lecmp(s, s2)
                s2 = '\ud900\udfff'
                test_lecmp(s, s2)
                s2 = '\uda00\udfff'
                test_lecmp(s, s2)
                s2 = '\udb00\udfff'
                test_lecmp(s, s2)

            test_fixup('\ue000')
            test_fixup('\uff61')

            # Surrogates on both sides, no fixup required
            self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000553
def test_islower(self):
    # Run the shared mixin checks first, then the str-specific cases.
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # Each entry pairs one character with the islower() result expected
    # for it.
    expectations = (
        ('\u2167', False),
        ('\u2177', True),
        # non-BMP, uppercase
        ('\U00010401', False),
        ('\U00010427', False),
        # non-BMP, lowercase
        ('\U00010429', True),
        ('\U0001044E', True),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_lower in expectations:
        check = self.assertTrue if is_lower else self.assertFalse
        check(char.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000568
def test_isupper(self):
    # Run the shared mixin checks first, then the str-specific cases.
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    running_on_java = sys.platform.startswith('java')
    if not running_on_java:
        # This one check is skipped on platforms whose name starts with
        # 'java'.
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # Each entry pairs one character with the isupper() result expected
    # for it.
    expectations = (
        ('\u2167', True),
        ('\u2177', False),
        # non-BMP, uppercase
        ('\U00010401', True),
        ('\U00010427', True),
        # non-BMP, lowercase
        ('\U00010429', False),
        ('\U0001044E', False),
        # non-BMP, non-cased
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_upper in expectations:
        check = self.assertTrue if is_upper else self.assertFalse
        check(char.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000584
def test_istitle(self):
    # Shared mixin checks, then strings containing U+1FFC.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_titlecased = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for ch in not_titlecased:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
596
def test_isspace(self):
    # Shared mixin checks, then a few BMP characters given as
    # (expected result, character).
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    bmp_cases = ((True, '\u2000'), (True, '\u200a'), (False, '\u2014'))
    for expected, char in bmp_cases:
        self.checkequalnofix(expected, char, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for ch in non_bmp:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
606
def test_isalnum(self):
    # Shared mixin checks, then non-BMP characters that must all be
    # reported as alphanumeric.
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    first_group = ['\U00010401', '\U00010427', '\U00010429', '\U0001044E']
    second_group = ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']
    for ch in first_group + second_group:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000612
def test_isalpha(self):
    # Shared mixin checks, then the str-specific cases.
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    for ch in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000624
def test_isdecimal(self):
    # (expected result, input) pairs fed through checkequalnofix.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() accepts no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    # Non-BMP characters, split by the answer expected from isdecimal().
    not_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    decimal = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
642
def test_isdigit(self):
    # Shared mixin checks, then (expected result, character) pairs.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    bmp_cases = ((True, '\u2460'), (False, '\xbc'), (True, '\u0660'))
    for expected, char in bmp_cases:
        self.checkequalnofix(expected, char, 'isdigit')

    # Non-BMP characters, split by the answer expected from isdigit().
    not_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
654
def test_isnumeric(self):
    # (expected result, input) pairs fed through checkequalnofix.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() accepts no arguments.
    with self.assertRaises(TypeError):
        "abc".isnumeric(42)

    # Non-BMP characters, split by the answer expected from isnumeric().
    not_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
673
Martin v. Löwis47383402007-08-15 07:32:56 +0000674 def test_isidentifier(self):
675 self.assertTrue("a".isidentifier())
676 self.assertTrue("Z".isidentifier())
677 self.assertTrue("_".isidentifier())
678 self.assertTrue("b0".isidentifier())
679 self.assertTrue("bc".isidentifier())
680 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000681 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500682 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000683
684 self.assertFalse(" ".isidentifier())
685 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000686 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000687 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000688
Georg Brandl559e5d72008-06-11 18:37:52 +0000689 def test_isprintable(self):
690 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000691 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000692 self.assertTrue("abcdefg".isprintable())
693 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000694 # some defined Unicode character
695 self.assertTrue("\u0374".isprintable())
696 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000697 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000698 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000699 self.assertFalse("\ud800".isprintable())
700
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300701 self.assertTrue('\U0001F46F'.isprintable())
702 self.assertFalse('\U000E0020'.isprintable())
703
704 def test_surrogates(self):
705 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
706 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
707 self.assertTrue(s.islower())
708 self.assertFalse(s.isupper())
709 self.assertFalse(s.istitle())
710 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
711 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
712 self.assertFalse(s.islower())
713 self.assertTrue(s.isupper())
714 self.assertTrue(s.istitle())
715
716 for meth_name in ('islower', 'isupper', 'istitle'):
717 meth = getattr(str, meth_name)
718 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
719 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
720
721 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
722 'isdecimal', 'isnumeric',
723 'isidentifier', 'isprintable'):
724 meth = getattr(str, meth_name)
725 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
726 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
727 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
728 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
729
730
def test_lower(self):
    # Shared checks first, then str.lower() on non-BMP characters and on
    # characters with special lowercase mappings.
    string_tests.CommonTest.test_lower(self)
    self.assertEqual('\U00010427'.lower(), '\U0001044F')
    self.assertEqual('\U00010427\U00010427'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.lower(),
                     'x\U0001044Fx\U0001044F')
    # LATIN SMALL LIGATURE FI (U+FB01) is already lowercase.  It is written
    # as an escape so that tooling which rewrites ligatures cannot turn
    # this into a vacuous check on ASCII 'fi'.
    self.assertEqual('\ufb01'.lower(), '\ufb01')
    self.assertEqual('\u0130'.lower(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.lower(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
    self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
    self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
    self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
    self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
    # Characters that lower() must leave unchanged.
    self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
    self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300752
Benjamin Petersond5890c82012-01-14 13:23:30 -0500753 def test_casefold(self):
754 self.assertEqual('hello'.casefold(), 'hello')
755 self.assertEqual('hELlo'.casefold(), 'hello')
756 self.assertEqual('ß'.casefold(), 'ss')
757 self.assertEqual('fi'.casefold(), 'fi')
758 self.assertEqual('\u03a3'.casefold(), '\u03c3')
759 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700760 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500761
def test_upper(self):
    # Shared checks first, then str.upper() on non-BMP characters and on
    # characters with special uppercase mappings.
    string_tests.CommonTest.test_upper(self)
    self.assertEqual('\U0001044F'.upper(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('X\U00010427x\U0001044F'.upper(),
                     'X\U00010427X\U00010427')
    # LATIN SMALL LIGATURE FI (U+FB01) uppercases to the two letters 'FI'.
    # It is written as an escape so that tooling which rewrites ligatures
    # cannot turn this into a vacuous check on ASCII 'fi'.
    self.assertEqual('\ufb01'.upper(), 'FI')
    self.assertEqual('\u0130'.upper(), '\u0130')
    self.assertEqual('\u03a3'.upper(), '\u03a3')
    self.assertEqual('ß'.upper(), 'SS')
    self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
    self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
    self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300778
def test_capitalize(self):
    # Shared checks first, then str.capitalize() on non-BMP characters and
    # on characters with special case mappings.
    string_tests.CommonTest.test_capitalize(self)
    self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                     'X\U0001044Fx\U0001044F')
    self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
    exp = '\u0399\u0308\u0300\u0069\u0307'
    self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
    # The input must start with LATIN SMALL LIGATURE FI (U+FB01): with a
    # plain ASCII 'f' the result would be 'Finnish' and this assertion
    # could never hold.  The escape keeps ligature-rewriting tools from
    # breaking it.
    # NOTE(review): 'FInnish' is what capitalize() gives when it uppercases
    # the first character; Python 3.8+ titlecases it instead and returns
    # 'Finnish' -- confirm against the interpreter this suite targets.
    self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
    self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300795
def test_title(self):
    # Shared checks first, then str.title() on non-BMP characters and on
    # characters with special case mappings.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    self.assertEqual('\U0001044F'.title(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
    # The word starts with LATIN SMALL LIGATURE FI (U+FB01), which
    # titlecases to 'Fi'.  It is written as an escape so that tooling which
    # rewrites ligatures cannot turn this into a plain ASCII check.
    self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
    self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
    self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300812
def test_swapcase(self):
    # Shared checks first, then str.swapcase() on non-BMP characters and
    # on characters with special case mappings.
    string_tests.CommonTest.test_swapcase(self)
    self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
    self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
    self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.swapcase(),
                     '\U0001044F\U00010427')
    self.assertEqual('\U0001044F\U00010427'.swapcase(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                     'x\U0001044FX\U00010427')
    # LATIN SMALL LIGATURE FI (U+FB01) swaps to the two letters 'FI'.  It
    # is written as an escape so that tooling which rewrites ligatures
    # cannot turn this into a vacuous check on ASCII 'fi'.
    self.assertEqual('\ufb01'.swapcase(), 'FI')
    self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.swapcase(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
    self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
    self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
    self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
    self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
    # Lowercase characters whose uppercase form is several characters long.
    self.assertEqual('ß'.swapcase(), 'SS')
    self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300838
def test_center(self):
    # Shared checks first, then centering with a non-BMP fill character.
    string_tests.CommonTest.test_center(self)
    fill = '\U0010FFFF'
    # (requested width, expected result) for centering 'x'.
    expectations = (
        (2, 'x\U0010FFFF'),
        (3, '\U0010FFFFx\U0010FFFF'),
        (4, '\U0010FFFFx\U0010FFFF\U0010FFFF'),
    )
    for width, padded in expectations:
        self.assertEqual('x'.center(width, fill), padded)
847
@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
def test_case_operation_overflow(self):
    # Issue #22643
    # Build the operand first so that only the upper() call itself sits
    # inside the assertRaises context.
    oversized = "ü" * (2**32 // 12 + 1)
    with self.assertRaises(OverflowError):
        oversized.upper()
853
Walter Dörwald28256f22003-01-19 16:59:20 +0000854 def test_contains(self):
855 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000856 self.assertIn('a', 'abdb')
857 self.assertIn('a', 'bdab')
858 self.assertIn('a', 'bdaba')
859 self.assertIn('a', 'bdba')
860 self.assertNotIn('a', 'bdb')
861 self.assertIn('a', 'bdba')
862 self.assertIn('a', ('a',1,None))
863 self.assertIn('a', (1,None,'a'))
864 self.assertIn('a', ('a',1,None))
865 self.assertIn('a', (1,None,'a'))
866 self.assertNotIn('a', ('x',1,'y'))
867 self.assertNotIn('a', ('x',1,None))
868 self.assertNotIn('abcd', 'abcxxxx')
869 self.assertIn('ab', 'abcd')
870 self.assertIn('ab', 'abc')
871 self.assertIn('ab', (1,None,'ab'))
872 self.assertIn('', 'abc')
873 self.assertIn('', '')
874 self.assertIn('', 'abc')
875 self.assertNotIn('\0', 'abc')
876 self.assertIn('\0', '\0abc')
877 self.assertIn('\0', 'abc\0')
878 self.assertIn('a', '\0abc')
879 self.assertIn('asdf', 'asdf')
880 self.assertNotIn('asdf', 'asd')
881 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000882
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000883 self.assertRaises(TypeError, "abc".__contains__)
Serhiy Storchakabe1eb142015-03-24 21:48:30 +0200884 # test mixed kinds
885 for fill in ('a', '\u0100', '\U00010300'):
886 fill *= 9
887 for delim in ('c', '\u0102', '\U00010302'):
888 self.assertNotIn(delim, fill)
889 self.assertIn(delim, fill + delim)
890 self.assertNotIn(delim * 2, fill)
891 self.assertIn(delim * 2, fill + delim * 2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000892
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300893 def test_issue18183(self):
894 '\U00010000\U00100000'.lower()
895 '\U00010000\U00100000'.casefold()
896 '\U00010000\U00100000'.upper()
897 '\U00010000\U00100000'.capitalize()
898 '\U00010000\U00100000'.title()
899 '\U00010000\U00100000'.swapcase()
900 '\U00100000'.center(3, '\U00010000')
901 '\U00100000'.ljust(3, '\U00010000')
902 '\U00100000'.rjust(3, '\U00010000')
903
Eric Smith8c663262007-08-25 02:26:07 +0000904 def test_format(self):
905 self.assertEqual(''.format(), '')
906 self.assertEqual('a'.format(), 'a')
907 self.assertEqual('ab'.format(), 'ab')
908 self.assertEqual('a{{'.format(), 'a{')
909 self.assertEqual('a}}'.format(), 'a}')
910 self.assertEqual('{{b'.format(), '{b')
911 self.assertEqual('}}b'.format(), '}b')
912 self.assertEqual('a{{b'.format(), 'a{b')
913
914 # examples from the PEP:
915 import datetime
916 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
917 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
918 "My name is Fred")
919 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
920 "My name is Fred :-{}")
921
922 d = datetime.date(2007, 8, 18)
923 self.assertEqual("The year is {0.year}".format(d),
924 "The year is 2007")
925
Eric Smith8c663262007-08-25 02:26:07 +0000926 # classes we'll use for testing
927 class C:
928 def __init__(self, x=100):
929 self._x = x
930 def __format__(self, spec):
931 return spec
932
933 class D:
934 def __init__(self, x):
935 self.x = x
936 def __format__(self, spec):
937 return str(self.x)
938
939 # class with __str__, but no __format__
940 class E:
941 def __init__(self, x):
942 self.x = x
943 def __str__(self):
944 return 'E(' + self.x + ')'
945
946 # class with __repr__, but no __format__ or __str__
947 class F:
948 def __init__(self, x):
949 self.x = x
950 def __repr__(self):
951 return 'F(' + self.x + ')'
952
953 # class with __format__ that forwards to string, for some format_spec's
954 class G:
955 def __init__(self, x):
956 self.x = x
957 def __str__(self):
958 return "string is " + self.x
959 def __format__(self, format_spec):
960 if format_spec == 'd':
961 return 'G(' + self.x + ')'
962 return object.__format__(self, format_spec)
963
Eric Smith739e2ad2007-08-27 19:07:22 +0000964 class I(datetime.date):
965 def __format__(self, format_spec):
966 return self.strftime(format_spec)
967
Eric Smith185e30c2007-08-30 22:23:08 +0000968 class J(int):
969 def __format__(self, format_spec):
970 return int.__format__(self * 2, format_spec)
971
Eric Smith8c663262007-08-25 02:26:07 +0000972
973 self.assertEqual(''.format(), '')
974 self.assertEqual('abc'.format(), 'abc')
975 self.assertEqual('{0}'.format('abc'), 'abc')
976 self.assertEqual('{0:}'.format('abc'), 'abc')
977# self.assertEqual('{ 0 }'.format('abc'), 'abc')
978 self.assertEqual('X{0}'.format('abc'), 'Xabc')
979 self.assertEqual('{0}X'.format('abc'), 'abcX')
980 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
981 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
982 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
983 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
984 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
985 self.assertEqual('{0}'.format(-15), '-15')
986 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
987 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
988 self.assertEqual('{{'.format(), '{')
989 self.assertEqual('}}'.format(), '}')
990 self.assertEqual('{{}}'.format(), '{}')
991 self.assertEqual('{{x}}'.format(), '{x}')
992 self.assertEqual('{{{0}}}'.format(123), '{123}')
993 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
994 self.assertEqual('}}{{'.format(), '}{')
995 self.assertEqual('}}x{{'.format(), '}x{')
996
Eric Smith7ade6482007-08-26 22:27:13 +0000997 # weird field names
998 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
999 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001000 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +00001001
Eric Smith8c663262007-08-25 02:26:07 +00001002 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1003 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1004 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1005 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1006 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1007 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1008 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1009
Eric Smith8c663262007-08-25 02:26:07 +00001010 # strings
1011 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1012 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1013 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1014 self.assertEqual('{0:.0s}'.format('abcdef'), '')
1015 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1016 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1017 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1018 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1019 self.assertEqual('{0:x<0s}'.format('result'), 'result')
1020 self.assertEqual('{0:x<5s}'.format('result'), 'result')
1021 self.assertEqual('{0:x<6s}'.format('result'), 'result')
1022 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1023 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1024 self.assertEqual('{0: <7s}'.format('result'), 'result ')
1025 self.assertEqual('{0:<7s}'.format('result'), 'result ')
1026 self.assertEqual('{0:>7s}'.format('result'), ' result')
1027 self.assertEqual('{0:>8s}'.format('result'), ' result')
1028 self.assertEqual('{0:^8s}'.format('result'), ' result ')
1029 self.assertEqual('{0:^9s}'.format('result'), ' result ')
1030 self.assertEqual('{0:^10s}'.format('result'), ' result ')
1031 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1032 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1033 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1034
Eric V. Smith2ea97122014-04-14 11:55:10 -04001035 # issue 12546: use \x00 as a fill character
1036 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1037 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1038 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1039 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
1040
1041 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1042 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1043 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1044 self.assertEqual('{0:<6}'.format(3), '3 ')
1045
1046 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1047 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1048 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1049 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1050
1051 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1052 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1053 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1054 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
1055
Eric Smith8c663262007-08-25 02:26:07 +00001056 # format specifiers for user defined type
1057 self.assertEqual('{0:abc}'.format(C()), 'abc')
1058
Georg Brandld52429f2008-07-04 15:55:02 +00001059 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +00001060 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1061 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1062 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
1063 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
1064 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1065 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1066 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001067 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001068 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
1069 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +00001070 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +00001071 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +00001072 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +00001073 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1074 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +00001075 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +00001076
Eric Smith8c663262007-08-25 02:26:07 +00001077 # test fallback to object.__format__
1078 self.assertEqual('{0}'.format({}), '{}')
1079 self.assertEqual('{0}'.format([]), '[]')
1080 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +00001081
Eric Smith8c663262007-08-25 02:26:07 +00001082 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +00001083 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1084
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +02001085 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1086 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1087 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +00001088
Eric Smith739e2ad2007-08-27 19:07:22 +00001089 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1090 month=8,
1091 day=27)),
1092 "date: 2007-08-27")
1093
Eric Smith185e30c2007-08-30 22:23:08 +00001094 # test deriving from a builtin type and overriding __format__
1095 self.assertEqual("{0}".format(J(10)), "20")
1096
1097
Eric Smith8c663262007-08-25 02:26:07 +00001098 # string format specifiers
1099 self.assertEqual('{0:}'.format('a'), 'a')
1100
1101 # computed format specifiers
1102 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1103 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1104 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1105 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
1106 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
1107
1108 # test various errors
1109 self.assertRaises(ValueError, '{'.format)
1110 self.assertRaises(ValueError, '}'.format)
1111 self.assertRaises(ValueError, 'a{'.format)
1112 self.assertRaises(ValueError, 'a}'.format)
1113 self.assertRaises(ValueError, '{a'.format)
1114 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +00001115 self.assertRaises(IndexError, '{0}'.format)
1116 self.assertRaises(IndexError, '{1}'.format, 'abc')
1117 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +00001118 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +00001119 self.assertRaises(ValueError, "abc{0:{}".format)
1120 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +00001121 self.assertRaises(IndexError, "{0.}".format)
1122 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -05001123 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +00001124 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +00001125 self.assertRaises(KeyError, "{0]}".format)
1126 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +00001127 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +00001128 self.assertRaises(ValueError, "{0[0}".format, 0)
1129 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1130 self.assertRaises(KeyError, "{c]}".format)
1131 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1132 self.assertRaises(ValueError, "{0}}".format, 0)
1133 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +00001134 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +00001135 self.assertRaises(ValueError, "{0!}".format, 0)
1136 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +00001137 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +00001138 self.assertRaises(IndexError, "{:}".format)
1139 self.assertRaises(IndexError, "{:s}".format)
1140 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +00001141 big = "23098475029384702983476098230754973209482573"
1142 self.assertRaises(ValueError, ("{" + big + "}").format)
1143 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +00001144
Eric Smith41669ca2009-05-23 14:23:22 +00001145 # issue 6089
1146 self.assertRaises(ValueError, "{0[0]x}".format, [None])
1147 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1148
Eric Smith8c663262007-08-25 02:26:07 +00001149 # can't have a replacement on the field name portion
1150 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1151
1152 # exceed maximum recursion depth
1153 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1154 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1155 0, 1, 2, 3, 4, 5, 6, 7)
1156
1157 # string format spec errors
1158 self.assertRaises(ValueError, "{0:-s}".format, '')
1159 self.assertRaises(ValueError, format, "", "-")
1160 self.assertRaises(ValueError, "{0:=s}".format, '')
1161
Eric Smithb1ebcc62008-07-15 13:02:41 +00001162 # Alternate formatting is not supported
1163 self.assertRaises(ValueError, format, '', '#')
1164 self.assertRaises(ValueError, format, '', '#20')
1165
Victor Stinnerece58de2012-04-23 23:36:38 +02001166 # Non-ASCII
1167 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1168 'ABC\u0410\u0411\u0412')
1169 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1170 'ABC')
1171 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1172 '')
1173
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001174 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001175 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1176 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1177 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1178 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1179 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1180 self.assertRaises(ValueError, "{a{}b}".format, 42)
1181 self.assertRaises(ValueError, "{a{b}".format, 42)
1182 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001183
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001184 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001185
Eric Smith27bbca62010-11-04 17:06:58 +00001186 def test_format_map(self):
1187 self.assertEqual(''.format_map({}), '')
1188 self.assertEqual('a'.format_map({}), 'a')
1189 self.assertEqual('ab'.format_map({}), 'ab')
1190 self.assertEqual('a{{'.format_map({}), 'a{')
1191 self.assertEqual('a}}'.format_map({}), 'a}')
1192 self.assertEqual('{{b'.format_map({}), '{b')
1193 self.assertEqual('}}b'.format_map({}), '}b')
1194 self.assertEqual('a{{b'.format_map({}), 'a{b')
1195
1196 # using mappings
1197 class Mapping(dict):
1198 def __missing__(self, key):
1199 return key
1200 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1201 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1202
1203 class InternalMapping:
1204 def __init__(self):
1205 self.mapping = {'a': 'hello'}
1206 def __getitem__(self, key):
1207 return self.mapping[key]
1208 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1209
1210
Eric Smith27bbca62010-11-04 17:06:58 +00001211 class C:
1212 def __init__(self, x=100):
1213 self._x = x
1214 def __format__(self, spec):
1215 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001216 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1217
1218 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001219 self.assertRaises(TypeError, ''.format_map)
1220 self.assertRaises(TypeError, 'a'.format_map)
1221
1222 self.assertRaises(ValueError, '{'.format_map, {})
1223 self.assertRaises(ValueError, '}'.format_map, {})
1224 self.assertRaises(ValueError, 'a{'.format_map, {})
1225 self.assertRaises(ValueError, 'a}'.format_map, {})
1226 self.assertRaises(ValueError, '{a'.format_map, {})
1227 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001228
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001229 # issue #12579: can't supply positional params to format_map
1230 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1231 self.assertRaises(ValueError, '{}'.format_map, 'a')
1232 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1233
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001234 def test_format_huge_precision(self):
1235 format_string = ".{}f".format(sys.maxsize + 1)
1236 with self.assertRaises(ValueError):
1237 result = format(2.34, format_string)
1238
1239 def test_format_huge_width(self):
1240 format_string = "{}f".format(sys.maxsize + 1)
1241 with self.assertRaises(ValueError):
1242 result = format(2.34, format_string)
1243
1244 def test_format_huge_item_number(self):
1245 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1246 with self.assertRaises(ValueError):
1247 result = format_string.format(2.34)
1248
Eric Smith8ec90442009-03-14 12:29:34 +00001249 def test_format_auto_numbering(self):
1250 class C:
1251 def __init__(self, x=100):
1252 self._x = x
1253 def __format__(self, spec):
1254 return spec
1255
1256 self.assertEqual('{}'.format(10), '10')
1257 self.assertEqual('{:5}'.format('s'), 's ')
1258 self.assertEqual('{!r}'.format('s'), "'s'")
1259 self.assertEqual('{._x}'.format(C(10)), '10')
1260 self.assertEqual('{[1]}'.format([1, 2]), '2')
1261 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1262 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1263
1264 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1265 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1266
1267 # can't mix and match numbering and auto-numbering
1268 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1269 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1270 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1271 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1272
1273 # can mix and match auto-numbering and named
1274 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1275 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1276 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1277 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1278
def test_formatting(self):
    """Check %-formatting of str objects.

    Runs the shared string_tests formatting checks first, then the
    str-specific cases below.
    """
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads to a minimum field width of 5, so four-character results
    # such as '3.00' are preceded by one extra space.
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    # (each mapping case appears once; the exact duplicates were removed)
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # '*' takes the width (and precision) from the argument tuple; a
    # negative width left-justifies.  Every result below is 5 columns wide.
    self.assertEqual('%*s' % (5,'abc',), '  abc')
    self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
    self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')
    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    #issue 19995
    # PseudoInt provides __index__; PseudoFloat only provides __int__.
    class PseudoInt:
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value
    class PseudoFloat:
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)
    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001365
def test_formatting_with_enum(self):
    """Check %-formatting of enum members that mix in str, int or float."""
    # issue18780
    import enum
    class Float(float, enum.Enum):
        PI = 3.1415926
    class Int(enum.IntEnum):
        IDES = 15
    class Str(str, enum.Enum):
        ABC = 'abc'
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
                     'Str.ABC, Str.ABC')
    # %5.2f pads '3.14' to five columns, hence the doubled space before it.
    self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
                     (Str.ABC, Str.ABC,
                      Int.IDES, Int.IDES, Int.IDES,
                      Float.PI, Float.PI),
                     'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
                     '...Str.ABC...')
    self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
                     '...Int.IDES...')
    self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
                     '...15...')
    self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
                     '...15...')
    self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
                     '...15...')
    self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
                     '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001397
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001398 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001399 format_string = "%.{}f".format(sys.maxsize + 1)
1400 with self.assertRaises(ValueError):
1401 result = format_string % 2.34
1402
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    """A %-format precision of INT_MAX + 1 raises ValueError."""
    # INT_MAX comes from the _testcapi helper module.
    from _testcapi import INT_MAX
    template = "%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, operator.mod, template, 2.34)
1409
1410 def test_formatting_huge_width(self):
1411 format_string = "%{}f".format(sys.maxsize + 1)
1412 with self.assertRaises(ValueError):
1413 result = format_string % 2.34
1414
Ezio Melottiba42fd52011-04-26 06:09:45 +03001415 def test_startswith_endswith_errors(self):
1416 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001417 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001418 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001419 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001420 self.assertIn('str', exc)
1421 self.assertIn('tuple', exc)
1422
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    """'%.1f' always uses '.' as the decimal separator."""
    # should not format with a comma, but always with C locale
    # (presumably the decorator activates de_DE or fr_FR for the call)
    formatted = '%.1f' % 1.0
    self.assertEqual('1.0', formatted)
Georg Brandlda6b1072006-01-20 17:48:54 +00001427
Walter Dörwald28256f22003-01-19 16:59:20 +00001428 def test_constructor(self):
1429 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1430
1431 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001432 str('unicode remains unicode'),
1433 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001434 )
1435
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001436 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001437 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001438
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001439 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1440 subclass = UnicodeSubclass(text)
1441 self.assertEqual(str(subclass), text)
1442 self.assertEqual(len(subclass), len(text))
1443 if text == 'ascii':
1444 self.assertEqual(subclass.encode('ascii'), b'ascii')
1445 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001446
Walter Dörwald28256f22003-01-19 16:59:20 +00001447 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001448 str('strings are converted to unicode'),
1449 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001450 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001451
Walter Dörwald28256f22003-01-19 16:59:20 +00001452 class StringCompat:
1453 def __init__(self, x):
1454 self.x = x
1455 def __str__(self):
1456 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001457
Walter Dörwald28256f22003-01-19 16:59:20 +00001458 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001459 str(StringCompat('__str__ compatible objects are recognized')),
1460 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001461 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001462
Walter Dörwald28256f22003-01-19 16:59:20 +00001463 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001464
Walter Dörwald28256f22003-01-19 16:59:20 +00001465 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001466 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001467 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001468
Guido van Rossume2a383d2007-01-15 16:59:06 +00001469 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001470 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001471
Walter Dörwald28256f22003-01-19 16:59:20 +00001472 # unicode(obj, encoding, error) tests (this maps to
1473 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001474
Walter Dörwald28256f22003-01-19 16:59:20 +00001475 if not sys.platform.startswith('java'):
1476 self.assertRaises(
1477 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001478 str,
1479 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001480 'utf-8',
1481 'strict'
1482 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001483
Walter Dörwald28256f22003-01-19 16:59:20 +00001484 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001485 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001486 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001487 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001488
Walter Dörwald28256f22003-01-19 16:59:20 +00001489 if not sys.platform.startswith('java'):
1490 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001491 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001492 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001493 'utf-8',
1494 'strict'
1495 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001496 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001497 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001498
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001499 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001500
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001501 def test_constructor_keyword_args(self):
1502 """Pass various keyword argument combinations to the constructor."""
1503 # The object argument can be passed as a keyword.
1504 self.assertEqual(str(object='foo'), 'foo')
1505 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1506 # The errors argument without encoding triggers "decode" mode.
1507 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1508 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1509
1510 def test_constructor_defaults(self):
1511 """Check the constructor argument defaults."""
1512 # The object argument defaults to '' or b''.
1513 self.assertEqual(str(), '')
1514 self.assertEqual(str(errors='strict'), '')
1515 utf8_cent = '¢'.encode('utf-8')
1516 # The encoding argument defaults to utf-8.
1517 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1518 # The errors argument defaults to strict.
1519 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1520
def test_codecs_utf7(self):
    """Check UTF-7 encoding and decoding of selected strings."""
    utfTests = [
        ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
        ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
        ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
        ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
        ('+', b'+-'),
        ('+-', b'+--'),
        ('+?', b'+-?'),
        # Raw strings: a backslash followed by '?' is not a recognised
        # escape sequence, so spell the backslash out explicitly.
        (r'\?', b'+AFw?'),
        (r'\\?', b'+AFwAXA?'),
        (r'\\\?', b'+AFwAXABc?'),
        (r'++--', b'+-+---'),
        ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
        ('/', b'/'),
    ]

    for (x, y) in utfTests:
        self.assertEqual(x.encode('utf-7'), y)

    # Unpaired surrogates are passed through
    self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
    self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
    self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
    self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
    self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
    self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
    self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
    self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')

    self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
    self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')

    # Issue #2242: crash on some Windows/MSVC versions
    self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')

    # Direct encoded characters
    set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
    # Optional direct characters
    set_o = '!"#$%&*;<=>@[]^_`{|}'
    for c in set_d:
        self.assertEqual(c.encode('utf7'), c.encode('ascii'))
        self.assertEqual(c.encode('ascii').decode('utf7'), c)
    # The optional set is only checked in the decoding direction.
    for c in set_o:
        self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001567
Walter Dörwald28256f22003-01-19 16:59:20 +00001568 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001569 self.assertEqual(''.encode('utf-8'), b'')
1570 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001571 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1572 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001573 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1574 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001575 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1576 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001577 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001578 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1579 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1580 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1581 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1582 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1583 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001584 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1585 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1586 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1587 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1588 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1589 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1590 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1591 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1592 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1593 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001594 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001595
Walter Dörwald28256f22003-01-19 16:59:20 +00001596 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001597 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1598 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1599 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600
Walter Dörwald28256f22003-01-19 16:59:20 +00001601 # Other possible utf-8 test cases:
1602 # * strict decoding testing for all of the
1603 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604
Ezio Melotti57221d02010-07-01 07:32:02 +00001605 def test_utf8_decode_valid_sequences(self):
1606 sequences = [
1607 # single byte
1608 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1609 # 2 bytes
1610 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1611 # 3 bytes
1612 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1613 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1614 # 4 bytes
1615 (b'\xF0\x90\x80\x80', '\U00010000'),
1616 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1617 ]
1618 for seq, res in sequences:
1619 self.assertEqual(seq.decode('utf-8'), res)
1620
1621
1622 def test_utf8_decode_invalid_sequences(self):
1623 # continuation bytes in a sequence of 2, 3, or 4 bytes
1624 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001625 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001626 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001627 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001628 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1629 invalid_start_bytes = (
1630 continuation_bytes + invalid_2B_seq_start_bytes +
1631 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1632 )
1633
1634 for byte in invalid_start_bytes:
1635 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1636
1637 for sb in invalid_2B_seq_start_bytes:
1638 for cb in continuation_bytes:
1639 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1640
1641 for sb in invalid_4B_seq_start_bytes:
1642 for cb1 in continuation_bytes[:3]:
1643 for cb3 in continuation_bytes[:3]:
1644 self.assertRaises(UnicodeDecodeError,
1645 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1646
1647 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1648 self.assertRaises(UnicodeDecodeError,
1649 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1650 self.assertRaises(UnicodeDecodeError,
1651 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1652 # surrogates
1653 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1654 self.assertRaises(UnicodeDecodeError,
1655 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1656 self.assertRaises(UnicodeDecodeError,
1657 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1658 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1659 self.assertRaises(UnicodeDecodeError,
1660 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1661 self.assertRaises(UnicodeDecodeError,
1662 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1663 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1664 self.assertRaises(UnicodeDecodeError,
1665 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1666 self.assertRaises(UnicodeDecodeError,
1667 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1668
1669 def test_issue8271(self):
1670 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1671 # only the start byte and the continuation byte(s) are now considered
1672 # invalid, instead of the number of bytes specified by the start byte.
1673 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1674 # table 3-8, Row 2) for more information about the algorithm used.
1675 FFFD = '\ufffd'
1676 sequences = [
1677 # invalid start bytes
1678 (b'\x80', FFFD), # continuation byte
1679 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1680 (b'\xc0', FFFD),
1681 (b'\xc0\xc0', FFFD*2),
1682 (b'\xc1', FFFD),
1683 (b'\xc1\xc0', FFFD*2),
1684 (b'\xc0\xc1', FFFD*2),
1685 # with start byte of a 2-byte sequence
1686 (b'\xc2', FFFD), # only the start byte
1687 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001688 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001689 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1690 # with start byte of a 3-byte sequence
1691 (b'\xe1', FFFD), # only the start byte
1692 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1693 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1694 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1695 (b'\xe1\x80', FFFD), # only 1 continuation byte
1696 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1697 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1698 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1699 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1700 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1701 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1702 # with start byte of a 4-byte sequence
1703 (b'\xf1', FFFD), # only the start byte
1704 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1705 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1706 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1707 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1708 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1709 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1710 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1711 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1712 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1713 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1714 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1715 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1716 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1717 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1718 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1719 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1720 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1721 # with invalid start byte of a 4-byte sequence (rfc2279)
1722 (b'\xf5', FFFD), # only the start byte
1723 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1724 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1725 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1726 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1727 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1728 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1729 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1730 # with invalid start byte of a 5-byte sequence (rfc2279)
1731 (b'\xf8', FFFD), # only the start byte
1732 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1733 (b'\xf8\x80', FFFD*2), # only one continuation byte
1734 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1735 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1736 # with invalid start byte of a 6-byte sequence (rfc2279)
1737 (b'\xfc', FFFD), # only the start byte
1738 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1739 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1740 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1741 # invalid start byte
1742 (b'\xfe', FFFD),
1743 (b'\xfe\x80\x80', FFFD*3),
1744 # other sequences
1745 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1746 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1747 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1748 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1749 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1750 ]
1751 for n, (seq, res) in enumerate(sequences):
1752 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1753 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1754 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1755 self.assertEqual(seq.decode('utf-8', 'ignore'),
1756 res.replace('\uFFFD', ''))
1757
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001758 def to_bytestring(self, seq):
1759 return bytes(int(c, 16) for c in seq.split())
1760
1761 def assertCorrectUTF8Decoding(self, seq, res, err):
1762 """
1763 Check that an invalid UTF-8 sequence raises an UnicodeDecodeError when
1764 'strict' is used, returns res when 'replace' is used, and that doesn't
1765 return anything when 'ignore' is used.
1766 """
1767 with self.assertRaises(UnicodeDecodeError) as cm:
1768 seq.decode('utf-8')
1769 exc = cm.exception
1770
1771 self.assertIn(err, str(exc))
1772 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1773 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1774 'aaaa' + res + 'bbbb')
1775 res = res.replace('\ufffd', '')
1776 self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1777 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1778 'aaaa' + res + 'bbbb')
1779
1780 def test_invalid_start_byte(self):
1781 """
1782 Test that an 'invalid start byte' error is raised when the first byte
1783 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1784 4-bytes sequence. The invalid start byte is replaced with a single
1785 U+FFFD when errors='replace'.
1786 E.g. <80> is a continuation byte and can appear only after a start byte.
1787 """
1788 FFFD = '\ufffd'
1789 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1790 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1791 'invalid start byte')
1792
1793 def test_unexpected_end_of_data(self):
1794 """
1795 Test that an 'unexpected end of data' error is raised when the string
1796 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1797 enough continuation bytes. The incomplete sequence is replaced with a
1798 single U+FFFD when errors='replace'.
1799 E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1800 sequence, but it's followed by only 2 valid continuation bytes and the
1801 last continuation bytes is missing.
1802 Note: the continuation bytes must be all valid, if one of them is
1803 invalid another error will be raised.
1804 """
1805 sequences = [
1806 'C2', 'DF',
1807 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1808 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1809 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1810 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1811 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1812 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1813 ]
1814 FFFD = '\ufffd'
1815 for seq in sequences:
1816 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd',
1817 'unexpected end of data')
1818
1819 def test_invalid_cb_for_2bytes_seq(self):
1820 """
1821 Test that an 'invalid continuation byte' error is raised when the
1822 continuation byte of a 2-bytes sequence is invalid. The start byte
1823 is replaced by a single U+FFFD and the second byte is handled
1824 separately when errors='replace'.
1825 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1826 sequence, but 41 is not a valid continuation byte because it's the
1827 ASCII letter 'A'.
1828 """
1829 FFFD = '\ufffd'
1830 FFFDx2 = FFFD * 2
1831 sequences = [
1832 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1833 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1834 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1835 ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1836 ]
1837 for seq, res in sequences:
1838 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1839 'invalid continuation byte')
1840
1841 def test_invalid_cb_for_3bytes_seq(self):
1842 """
1843 Test that an 'invalid continuation byte' error is raised when the
1844 continuation byte(s) of a 3-bytes sequence are invalid. When
1845 errors='replace', if the first continuation byte is valid, the first
1846 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1847 third byte is handled separately, otherwise only the start byte is
1848 replaced with a U+FFFD and the other continuation bytes are handled
1849 separately.
1850 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1851 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1852 because it's the ASCII letter 'A'.
1853 Note: when the start byte is E0 or ED, the valid ranges for the first
1854 continuation byte are limited to A0..BF and 80..9F respectively.
1855 Python 2 used to consider all the bytes in range 80..BF valid when the
1856 start byte was ED. This is fixed in Python 3.
1857 """
1858 FFFD = '\ufffd'
1859 FFFDx2 = FFFD * 2
1860 sequences = [
1861 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1862 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1863 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1864 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1865 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1866 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1867 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1868 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1869 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1870 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1871 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1872 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1873 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1874 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1875 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1876 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1877 ('ED 7F', FFFD+'\x7f'),
1878 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1879 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1880 ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1881 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1882 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1883 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1884 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1885 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1886 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1887 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1888 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1889 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1890 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1891 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1892 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1893 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1894 ]
1895 for seq, res in sequences:
1896 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1897 'invalid continuation byte')
1898
1899 def test_invalid_cb_for_4bytes_seq(self):
1900 """
1901 Test that an 'invalid continuation byte' error is raised when the
1902 continuation byte(s) of a 4-bytes sequence are invalid. When
1903 errors='replace',the start byte and all the following valid
1904 continuation bytes are replaced with a single U+FFFD, and all the bytes
1905 starting from the first invalid continuation bytes (included) are
1906 handled separately.
1907 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1908 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1909 because it's the ASCII letter 'A'.
1910 Note: when the start byte is E0 or ED, the valid ranges for the first
1911 continuation byte are limited to A0..BF and 80..9F respectively.
1912 However, when the start byte is ED, Python 2 considers all the bytes
1913 in range 80..BF valid. This is fixed in Python 3.
1914 """
1915 FFFD = '\ufffd'
1916 FFFDx2 = FFFD * 2
1917 sequences = [
1918 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1919 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1920 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1921 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1922 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1923 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1924 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1925 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1926 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1927 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1928 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1929 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1930 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1931 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1932 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1933 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1934 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1935 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1936 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1937 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1938 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1939 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1940 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1941 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1942 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1943 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1944 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1945 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
1946 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
1947 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
1948 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
1949 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
1950 ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
1951 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
1952 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
1953 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
1954 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
1955 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
1956 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
1957 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
1958 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
1959 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
1960 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
1961 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
1962 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
1963 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
1964 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
1965 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
1966 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
1967 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
1968 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
1969 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
1970 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
1971 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
1972 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
1973 ]
1974 for seq, res in sequences:
1975 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1976 'invalid continuation byte')
1977
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001978 def test_codecs_idna(self):
1979 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001980 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001981
Walter Dörwald28256f22003-01-19 16:59:20 +00001982 def test_codecs_errors(self):
1983 # Error handling (encoding)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001984 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
1985 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
Walter Dörwald67e83882007-05-05 12:26:27 +00001986 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
1987 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
Benjamin Peterson308d6372009-09-18 21:42:35 +00001988 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
1989 'Andr\202 x'.encode('ascii', errors='replace'))
1990 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
1991 'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001992
Walter Dörwald28256f22003-01-19 16:59:20 +00001993 # Error handling (decoding)
Walter Dörwald67e83882007-05-05 12:26:27 +00001994 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
1995 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
1996 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
1997 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001998
Walter Dörwald28256f22003-01-19 16:59:20 +00001999 # Error handling (unknown character names)
Guido van Rossum39478e82007-08-27 17:23:59 +00002000 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002001
Walter Dörwald28256f22003-01-19 16:59:20 +00002002 # Error handling (truncated escape sequence)
Guido van Rossum9c627722007-08-27 18:31:48 +00002003 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002004
Guido van Rossum9c627722007-08-27 18:31:48 +00002005 self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2006 self.assertRaises(TypeError, str, b"hello", "test.unicode2")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002007 self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2008 self.assertRaises(TypeError, "hello".encode, "test.unicode2")
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +00002009
Walter Dörwald28256f22003-01-19 16:59:20 +00002010 # Error handling (wrong arguments)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002011 self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002012
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00002013 # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00002014 self.assertRaises(UnicodeError, float, "\ud800")
2015 self.assertRaises(UnicodeError, float, "\udf00")
2016 self.assertRaises(UnicodeError, complex, "\ud800")
2017 self.assertRaises(UnicodeError, complex, "\udf00")
Guido van Rossum97064862000-04-10 13:52:48 +00002018
Walter Dörwald28256f22003-01-19 16:59:20 +00002019 def test_codecs(self):
2020 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00002021 self.assertEqual('hello'.encode('ascii'), b'hello')
2022 self.assertEqual('hello'.encode('utf-7'), b'hello')
2023 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002024 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00002025 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2026 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2027 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00002028
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00002029 # Default encoding is utf-8
2030 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2031
Walter Dörwald28256f22003-01-19 16:59:20 +00002032 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002033 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002034 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002035 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2036 'utf-16-be', 'raw_unicode_escape',
2037 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01002038 with warnings.catch_warnings():
2039 # unicode-internal has been deprecated
2040 warnings.simplefilter("ignore", DeprecationWarning)
2041
2042 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002043
Walter Dörwald28256f22003-01-19 16:59:20 +00002044 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002045 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002046 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002047 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002048 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002049
Walter Dörwald28256f22003-01-19 16:59:20 +00002050 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00002051 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00002052 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002053 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002054 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002055
Walter Dörwald28256f22003-01-19 16:59:20 +00002056 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01002057 with warnings.catch_warnings():
2058 # unicode-internal has been deprecated
2059 warnings.simplefilter("ignore", DeprecationWarning)
2060
2061 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2062 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2063 'raw_unicode_escape',
2064 'unicode_escape', 'unicode_internal'):
2065 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00002066
Antoine Pitrou51f66482011-11-11 13:35:44 +01002067 # UTF-8 must be roundtrip safe for all code points
2068 # (except surrogates, which are forbidden).
2069 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02002070 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00002071 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002072 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002073
Walter Dörwald28256f22003-01-19 16:59:20 +00002074 def test_codecs_charmap(self):
2075 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00002076 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00002077 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002078 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002079 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2080 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002081 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002082 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2083 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2084 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
2085 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002086
Walter Dörwald28256f22003-01-19 16:59:20 +00002087 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2088 'cp1256', 'cp1257', 'cp1258',
2089 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002090
Walter Dörwald28256f22003-01-19 16:59:20 +00002091 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2092 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002093
Walter Dörwald28256f22003-01-19 16:59:20 +00002094 ### These have undefined mappings:
2095 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00002096
Walter Dörwald28256f22003-01-19 16:59:20 +00002097 ### These fail the round-trip:
2098 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00002099
Walter Dörwald28256f22003-01-19 16:59:20 +00002100 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002101 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002102
Walter Dörwald28256f22003-01-19 16:59:20 +00002103 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00002104 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00002105 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05002106 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00002107 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2108 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02002109 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00002110 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2111 'iso8859_2', 'iso8859_4', 'iso8859_5',
2112 'iso8859_9', 'koi8_r', 'latin_1',
2113 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00002114
Walter Dörwald28256f22003-01-19 16:59:20 +00002115 ### These have undefined mappings:
2116 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2117 #'cp1256', 'cp1257', 'cp1258',
2118 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2119 #'iso8859_3', 'iso8859_6', 'iso8859_7',
2120 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00002121
Walter Dörwald28256f22003-01-19 16:59:20 +00002122 ### These fail the round-trip:
2123 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00002124
Walter Dörwald28256f22003-01-19 16:59:20 +00002125 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002126 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00002127
Walter Dörwald28256f22003-01-19 16:59:20 +00002128 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002129 self.assertEqual(("abc" "def"), "abcdef")
2130 self.assertEqual(("abc" "def"), "abcdef")
2131 self.assertEqual(("abc" "def"), "abcdef")
2132 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2133 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00002134
Walter Dörwald28256f22003-01-19 16:59:20 +00002135 def test_printing(self):
2136 class BitBucket:
2137 def write(self, text):
2138 pass
Fred Drake004d5e62000-10-23 17:22:08 +00002139
Walter Dörwald28256f22003-01-19 16:59:20 +00002140 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002141 print('abc', file=out)
2142 print('abc', 'def', file=out)
2143 print('abc', 'def', file=out)
2144 print('abc', 'def', file=out)
2145 print('abc\n', file=out)
2146 print('abc\n', end=' ', file=out)
2147 print('abc\n', end=' ', file=out)
2148 print('def\n', file=out)
2149 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00002150
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002151 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002152 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002153 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2154 self.assertEqual(x, y)
2155
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002156 y = br'\U00100000'
2157 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2158 self.assertEqual(x, y)
2159 y = br'\U00010000'
2160 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2161 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002162
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002163 try:
2164 br'\U11111111'.decode("raw-unicode-escape")
2165 except UnicodeDecodeError as e:
2166 self.assertEqual(e.start, 0)
2167 self.assertEqual(e.end, 10)
2168 else:
2169 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002170
Brett Cannonc3647ac2005-04-26 03:45:26 +00002171 def test_conversion(self):
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002172 # Make sure __str__() works properly
2173 class ObjectToStr:
Brett Cannonc3647ac2005-04-26 03:45:26 +00002174 def __str__(self):
2175 return "foo"
2176
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002177 class StrSubclassToStr(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002178 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002179 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002180
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002181 class StrSubclassToStrSubclass(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002182 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002183 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002184 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002185 return self
2186
Serhiy Storchakaa60c2fe2015-03-12 21:56:08 +02002187 self.assertEqual(str(ObjectToStr()), "foo")
2188 self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2189 s = str(StrSubclassToStrSubclass("foo"))
2190 self.assertEqual(s, "foofoo")
2191 self.assertIs(type(s), StrSubclassToStrSubclass)
Brett Cannonc3647ac2005-04-26 03:45:26 +00002192
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002193 def test_unicode_repr(self):
2194 class s1:
2195 def __repr__(self):
2196 return '\\n'
2197
2198 class s2:
2199 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002200 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002201
2202 self.assertEqual(repr(s1()), '\\n')
2203 self.assertEqual(repr(s2()), '\\n')
2204
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002205 def test_printable_repr(self):
2206 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002207 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002208
Zachary Ware9fe6d862013-12-08 00:20:35 -06002209 # This test only affects 32-bit platforms because expandtabs can only take
2210 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2211 # to take a 64-bit long, this test should apply to all platforms.
2212 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2213 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002214 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002215 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002216
Victor Stinner1d972ad2011-10-07 13:31:46 +02002217 @support.cpython_only
Antoine Pitroue19aa382011-10-04 16:04:01 +02002218 def test_expandtabs_optimization(self):
2219 s = 'abc'
2220 self.assertIs(s.expandtabs(), s)
2221
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002222 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if struct.calcsize('P') == 8:
2224 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002225 ascii_struct_size = 48
2226 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 else:
2228 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002229 ascii_struct_size = 24
2230 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231
2232 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2233 code = ord(char)
2234 if code < 0x100:
2235 char_size = 1 # sizeof(Py_UCS1)
2236 struct_size = ascii_struct_size
2237 elif code < 0x10000:
2238 char_size = 2 # sizeof(Py_UCS2)
2239 struct_size = compact_struct_size
2240 else:
2241 char_size = 4 # sizeof(Py_UCS4)
2242 struct_size = compact_struct_size
2243 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002244 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2245 # be allocatable, given enough memory.
2246 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 alloc = lambda: char * maxlen
2248 self.assertRaises(MemoryError, alloc)
2249 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002250
Victor Stinner808fc0a2010-03-22 12:50:40 +00002251 def test_format_subclass(self):
2252 class S(str):
2253 def __str__(self):
2254 return '__str__ overridden'
2255 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002256 self.assertEqual("%s" % s, '__str__ overridden')
2257 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002258
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002259 # Test PyUnicode_FromFormat()
Victor Stinner1205f272010-09-11 00:54:47 +00002260 def test_from_format(self):
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002261 support.import_module('ctypes')
Victor Stinner15a11362012-10-06 23:48:20 +02002262 from ctypes import (
2263 pythonapi, py_object, sizeof,
Victor Stinner6d970f42011-03-02 00:04:25 +00002264 c_int, c_long, c_longlong, c_ssize_t,
Victor Stinner15a11362012-10-06 23:48:20 +02002265 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 name = "PyUnicode_FromFormat"
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002267 _PyUnicode_FromFormat = getattr(pythonapi, name)
2268 _PyUnicode_FromFormat.restype = py_object
2269
2270 def PyUnicode_FromFormat(format, *args):
2271 cargs = tuple(
2272 py_object(arg) if isinstance(arg, str) else arg
2273 for arg in args)
2274 return _PyUnicode_FromFormat(format, *cargs)
Victor Stinner1205f272010-09-11 00:54:47 +00002275
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002276 def check_format(expected, format, *args):
2277 text = PyUnicode_FromFormat(format, *args)
2278 self.assertEqual(expected, text)
2279
Victor Stinner1205f272010-09-11 00:54:47 +00002280 # ascii format, non-ascii argument
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002281 check_format('ascii\x7f=unicode\xe9',
2282 b'ascii\x7f=%U', 'unicode\xe9')
Victor Stinner1205f272010-09-11 00:54:47 +00002283
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002284 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2285 # raises an error
Ezio Melottied3a7d22010-12-01 02:32:32 +00002286 self.assertRaisesRegex(ValueError,
Victor Stinner1205f272010-09-11 00:54:47 +00002287 '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
Victor Stinner4c7db312010-09-12 07:51:18 +00002288 'string, got a non-ASCII byte: 0xe9$',
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00002289 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002290
Victor Stinner96865452011-03-01 23:44:09 +00002291 # test "%c"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002292 check_format('\uabcd',
2293 b'%c', c_int(0xabcd))
2294 check_format('\U0010ffff',
2295 b'%c', c_int(0x10ffff))
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002296 with self.assertRaises(OverflowError):
2297 PyUnicode_FromFormat(b'%c', c_int(0x110000))
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +03002298 # Issue #18183
Serhiy Storchakaf15ffe02013-06-12 09:28:20 +03002299 check_format('\U00010000\U00100000',
2300 b'%c%c', c_int(0x10000), c_int(0x100000))
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002301
Victor Stinner96865452011-03-01 23:44:09 +00002302 # test "%"
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002303 check_format('%',
2304 b'%')
2305 check_format('%',
2306 b'%%')
2307 check_format('%s',
2308 b'%%s')
2309 check_format('[%]',
2310 b'[%%]')
2311 check_format('%abc',
2312 b'%%%s', b'abc')
2313
2314 # truncated string
2315 check_format('abc',
2316 b'%.3s', b'abcdef')
2317 check_format('abc[\ufffd',
2318 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2319 check_format("'\\u20acABC'",
2320 b'%A', '\u20acABC')
2321 check_format("'\\u20",
2322 b'%.5A', '\u20acABCDEF')
2323 check_format("'\u20acABC'",
2324 b'%R', '\u20acABC')
2325 check_format("'\u20acA",
2326 b'%.3R', '\u20acABCDEF')
2327 check_format('\u20acAB',
2328 b'%.3S', '\u20acABCDEF')
2329 check_format('\u20acAB',
2330 b'%.3U', '\u20acABCDEF')
2331 check_format('\u20acAB',
2332 b'%.3V', '\u20acABCDEF', None)
2333 check_format('abc[\ufffd',
2334 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2335
2336 # the following tests come from #7330
2337 # test width modifier and precision modifier with %S
2338 check_format("repr= abc",
2339 b'repr=%5S', 'abc')
2340 check_format("repr=ab",
2341 b'repr=%.2S', 'abc')
2342 check_format("repr= ab",
2343 b'repr=%5.2S', 'abc')
2344
2345 # test width modifier and precision modifier with %R
2346 check_format("repr= 'abc'",
2347 b'repr=%8R', 'abc')
2348 check_format("repr='ab",
2349 b'repr=%.3R', 'abc')
2350 check_format("repr= 'ab",
2351 b'repr=%5.3R', 'abc')
2352
2353 # test width modifier and precision modifier with %A
2354 check_format("repr= 'abc'",
2355 b'repr=%8A', 'abc')
2356 check_format("repr='ab",
2357 b'repr=%.3A', 'abc')
2358 check_format("repr= 'ab",
2359 b'repr=%5.3A', 'abc')
2360
2361 # test width modifier and precision modifier with %s
2362 check_format("repr= abc",
2363 b'repr=%5s', b'abc')
2364 check_format("repr=ab",
2365 b'repr=%.2s', b'abc')
2366 check_format("repr= ab",
2367 b'repr=%5.2s', b'abc')
2368
2369 # test width modifier and precision modifier with %U
2370 check_format("repr= abc",
2371 b'repr=%5U', 'abc')
2372 check_format("repr=ab",
2373 b'repr=%.2U', 'abc')
2374 check_format("repr= ab",
2375 b'repr=%5.2U', 'abc')
2376
2377 # test width modifier and precision modifier with %V
2378 check_format("repr= abc",
2379 b'repr=%5V', 'abc', b'123')
2380 check_format("repr=ab",
2381 b'repr=%.2V', 'abc', b'123')
2382 check_format("repr= ab",
2383 b'repr=%5.2V', 'abc', b'123')
2384 check_format("repr= 123",
2385 b'repr=%5V', None, b'123')
2386 check_format("repr=12",
2387 b'repr=%.2V', None, b'123')
2388 check_format("repr= 12",
2389 b'repr=%5.2V', None, b'123')
Victor Stinner96865452011-03-01 23:44:09 +00002390
Victor Stinner6d970f42011-03-02 00:04:25 +00002391 # test integer formats (%i, %d, %u)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 check_format('010',
2393 b'%03i', c_int(10))
2394 check_format('0010',
2395 b'%0.4i', c_int(10))
2396 check_format('-123',
2397 b'%i', c_int(-123))
2398 check_format('-123',
2399 b'%li', c_long(-123))
2400 check_format('-123',
2401 b'%lli', c_longlong(-123))
2402 check_format('-123',
2403 b'%zi', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002404
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002405 check_format('-123',
2406 b'%d', c_int(-123))
2407 check_format('-123',
2408 b'%ld', c_long(-123))
2409 check_format('-123',
2410 b'%lld', c_longlong(-123))
2411 check_format('-123',
2412 b'%zd', c_ssize_t(-123))
Victor Stinner96865452011-03-01 23:44:09 +00002413
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002414 check_format('123',
2415 b'%u', c_uint(123))
2416 check_format('123',
2417 b'%lu', c_ulong(123))
2418 check_format('123',
2419 b'%llu', c_ulonglong(123))
2420 check_format('123',
2421 b'%zu', c_size_t(123))
Victor Stinner6d970f42011-03-02 00:04:25 +00002422
Victor Stinner15a11362012-10-06 23:48:20 +02002423 # test long output
2424 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2425 max_longlong = -min_longlong - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002426 check_format(str(min_longlong),
2427 b'%lld', c_longlong(min_longlong))
2428 check_format(str(max_longlong),
2429 b'%lld', c_longlong(max_longlong))
Victor Stinner15a11362012-10-06 23:48:20 +02002430 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002431 check_format(str(max_ulonglong),
2432 b'%llu', c_ulonglong(max_ulonglong))
Victor Stinner15a11362012-10-06 23:48:20 +02002433 PyUnicode_FromFormat(b'%p', c_void_p(-1))
2434
Victor Stinnere215d962012-10-06 23:03:36 +02002435 # test padding (width and/or precision)
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002436 check_format('123'.rjust(10, '0'),
2437 b'%010i', c_int(123))
2438 check_format('123'.rjust(100),
2439 b'%100i', c_int(123))
2440 check_format('123'.rjust(100, '0'),
2441 b'%.100i', c_int(123))
2442 check_format('123'.rjust(80, '0').rjust(100),
2443 b'%100.80i', c_int(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002444
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 check_format('123'.rjust(10, '0'),
2446 b'%010u', c_uint(123))
2447 check_format('123'.rjust(100),
2448 b'%100u', c_uint(123))
2449 check_format('123'.rjust(100, '0'),
2450 b'%.100u', c_uint(123))
2451 check_format('123'.rjust(80, '0').rjust(100),
2452 b'%100.80u', c_uint(123))
Victor Stinnere215d962012-10-06 23:03:36 +02002453
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002454 check_format('123'.rjust(10, '0'),
2455 b'%010x', c_int(0x123))
2456 check_format('123'.rjust(100),
2457 b'%100x', c_int(0x123))
2458 check_format('123'.rjust(100, '0'),
2459 b'%.100x', c_int(0x123))
2460 check_format('123'.rjust(80, '0').rjust(100),
2461 b'%100.80x', c_int(0x123))
Victor Stinnere215d962012-10-06 23:03:36 +02002462
Victor Stinner6d970f42011-03-02 00:04:25 +00002463 # test %A
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002464 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2465 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
Victor Stinner9a909002010-10-18 20:59:24 +00002466
Victor Stinner6d970f42011-03-02 00:04:25 +00002467 # test %V
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002468 check_format('repr=abc',
2469 b'repr=%V', 'abc', b'xyz')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470
2471 # Test string decode from parameter of %s using utf-8.
2472 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2473 # '\u4eba\u6c11'
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002474 check_format('repr=\u4eba\u6c11',
2475 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002476
2477 #Test replace error handler.
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002478 check_format('repr=abc\ufffd',
2479 b'repr=%V', None, b'abc\xff')
Victor Stinner2512a8b2011-03-01 22:46:52 +00002480
Victor Stinner6d970f42011-03-02 00:04:25 +00002481 # not supported: copy the raw format string. these tests are just here
2482 # to check for crashes and should not be considered as specifications
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002483 check_format('%s',
2484 b'%1%s', b'abc')
2485 check_format('%1abc',
2486 b'%1abc')
2487 check_format('%+i',
2488 b'%+i', c_int(10))
2489 check_format('%.%s',
2490 b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002491
# Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    # Exercises the _testcapi.unicode_aswidechar(text, buflen) wrapper, which
    # returns a (wchar, size) pair.  Each row below gives the input text, the
    # buffer length passed in, the expected size and the expected buffer
    # content.
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    cases = (
        # (text, buflen, expected size, expected wchar)
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    )
    for text, length, expected_size, expected_wchar in cases:
        wchar, size = unicode_aswidechar(text, length)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)

    # A non-BMP character takes two wchar_t units when wchar_t is 2 bytes
    # wide and a single unit otherwise; the buffer leaves room for one more
    # unit so the trailing '\0' is part of the result.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        buflen, nchar = 3, 2
    else:  # sizeof(c_wchar) == 4
        buflen, nchar = 2, 1
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002529
# Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    # Exercises the _testcapi.unicode_aswidecharstring(text) wrapper, which
    # returns a (wchar, size) pair.  The expected buffer includes a trailing
    # '\0' that is not counted in the expected size.
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    cases = (
        # (text, expected size, expected wchar)
        ('abc', 3, 'abc\0'),
        ('abc\0def', 7, 'abc\0def\0'),
    )
    for text, expected_size, expected_wchar in cases:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)

    # A non-BMP character takes two wchar_t units when wchar_t is 2 bytes
    # wide and a single unit otherwise.
    nonbmp = chr(0x10ffff)
    nchar = 2 if sizeof(c_wchar) == 2 else 1  # else: sizeof(c_wchar) == 4
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002553
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002554 def test_subclass_add(self):
2555 class S(str):
2556 def __add__(self, o):
2557 return "3"
2558 self.assertEqual(S("4") + S("5"), "3")
2559 class S(str):
2560 def __iadd__(self, o):
2561 return "3"
2562 s = S("1")
2563 s += "4"
2564 self.assertEqual(s, "3")
2565
@support.cpython_only
def test_encode_decimal(self):
    # Exercises _testcapi.unicode_encodedecimal(): the rows below map an
    # input string to the bytes the wrapper is expected to return.
    from _testcapi import unicode_encodedecimal

    expectations = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, encoded in expectations:
        self.assertEqual(unicode_encodedecimal(text), encoded)

    # A character that cannot be encoded is an error with "strict" ...
    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")
    # ... and is reported as a ValueError with the "replace" handler.
    with self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002581
@support.cpython_only
def test_transform_decimal(self):
    # Exercises _testcapi.unicode_transformdecimaltoascii(): the rows below
    # map an input string to the string the wrapper is expected to return.
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    expectations = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, transformed in expectations:
        self.assertEqual(transform_decimal(text), transformed)
2593
Victor Stinnerc814a382011-11-22 01:06:15 +01002594 def test_getnewargs(self):
2595 text = 'abc'
2596 args = text.__getnewargs__()
2597 self.assertIsNot(args[0], text)
2598 self.assertEqual(args[0], text)
2599 self.assertEqual(len(args), 1)
2600
def test_resize(self):
    # Encode a string with the deprecated 'unicode_internal' codec, grow the
    # string with +=, and check that encoding it again reflects the new
    # content.
    expected_warning = ('unicode_internal codec has been '
                        'deprecated', DeprecationWarning)
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1); no other name may refer to
        # it before the += below
        text = 'a' * length + 'b'

        with support.check_warnings(expected_warning):
            # fill wstr internal field
            before = text.encode('unicode_internal')
            self.assertEqual(before.decode('unicode_internal'), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode('unicode_internal')
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode('unicode_internal'), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002617
def test_compare(self):
    # Issue #17615
    #
    # Compare strings built from ASCII, Latin-1, BMP and astral code points
    # (two strings per range) with every comparison operator.
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also yields the (s, s) pairs; plain
    # combinations() never pairs a string with itself, which would leave the
    # "equal" branch below unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            # Both orderings must be checked between the two distinct
            # copies, not between a copy and itself.
            self.assertTrue(copy1 <= copy2)
            self.assertTrue(copy1 >= copy2)

    # ASCII sorts before every other range.
    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    # Latin-1 sorts after ASCII and before BMP and astral.
    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    # BMP sorts after ASCII and Latin-1 and before astral.
    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    # Astral sorts after every other range.
    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2691
Victor Stinner1c24bd02010-10-02 11:03:13 +00002692
class StringModuleTest(unittest.TestCase):
    """Tests for the formatter helpers exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        # Each row pairs a format string with the list of
        # (literal_text, field_name, format_spec, conversion) tuples that
        # _string.formatter_parser() is expected to yield for it.
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, fields in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, fields)

        # Non-string input is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        # _string.formatter_field_name_split() returns the first name and an
        # iterable of (is_attribute, name) accessors; materialize the
        # iterable so the result can be compared with a plain list.
        def split(name):
            first, accessors = _string.formatter_field_name_split(name)
            return [first, list(accessors)]

        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, parts in expectations:
            self.assertEqual(split(field_name), parts)

        # Non-string input is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2742
2743
# Run the test cases defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()