blob: e4cd99b06f6581b4d9bbbe515069984b92182569 [file] [log] [blame]
Guido van Rossuma831cac2000-03-10 23:23:21 +00001""" Test script for the Unicode implementation.
2
Guido van Rossuma831cac2000-03-10 23:23:21 +00003Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
Marc-André Lemburg36619082001-01-17 19:11:13 +00007"""#"
Victor Stinner040e16e2011-11-15 22:44:05 +01008import _string
Guido van Rossum98297ee2007-11-06 21:34:58 +00009import codecs
Victor Stinner9fc59812013-04-08 22:34:43 +020010import itertools
Ethan Furman9ab74802014-03-21 06:38:46 -070011import operator
Guido van Rossum98297ee2007-11-06 21:34:58 +000012import struct
13import sys
14import unittest
15import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000016from test import support, string_tests
Guido van Rossuma831cac2000-03-10 23:23:21 +000017
Neal Norwitz430f68b2005-11-24 22:00:56 +000018# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    For "test.unicode1" the encoder and decoder return a bare int instead
    of a tuple; for "test.unicode2" they return a tuple whose first item
    is not a string.  Any other encoding name yields None.
    """
    # Codec pair whose functions do not return a tuple at all.
    def encode1(input, errors="strict"):
        return 42

    def decode1(input, errors="strict"):
        return 42

    # Codec pair returning a tuple, but without any unicode in it.
    def encode2(input, errors="strict"):
        return (42, 42)

    def decode2(input, errors="strict"):
        return (42, 42)

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
35
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The round trip goes through UTF-8 with the 'surrogatepass' error
    handler so that strings containing lone surrogates can be cloned
    too, instead of raising UnicodeEncodeError.
    """
    return text.encode('utf-8', 'surrogatepass').decode('utf-8', 'surrogatepass')
45
Brett Cannon226b2302010-03-20 22:22:22 +000046class UnicodeTest(string_tests.CommonTest,
47 string_tests.MixinStrUnicodeUserStringTest,
Ezio Melotti0dceb562013-01-10 07:43:26 +020048 string_tests.MixinStrUnicodeTest,
49 unittest.TestCase):
Brett Cannon226b2302010-03-20 22:22:22 +000050
Guido van Rossumef87d6e2007-05-02 19:09:54 +000051 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000052
53 def checkequalnofix(self, result, object, methodname, *args):
54 method = getattr(object, methodname)
55 realresult = method(*args)
56 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000057 self.assertTrue(type(realresult) is type(result))
Walter Dörwald0fd583c2003-02-21 12:53:50 +000058
59 # if the original is returned make sure that
60 # this doesn't happen with subclasses
61 if realresult is object:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000062 class usub(str):
Walter Dörwald0fd583c2003-02-21 12:53:50 +000063 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000064 return 'usub(%r)' % str.__repr__(self)
Walter Dörwald0fd583c2003-02-21 12:53:50 +000065 object = usub(object)
66 method = getattr(object, methodname)
67 realresult = method(*args)
68 self.assertEqual(realresult, result)
Benjamin Petersonc9c0f202009-06-30 23:06:06 +000069 self.assertTrue(object is not realresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000070
Jeremy Hylton504de6b2003-10-06 05:08:26 +000071 def test_literals(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000072 self.assertEqual('\xff', '\u00ff')
73 self.assertEqual('\uffff', '\U0000ffff')
Guido van Rossum36e0a922007-07-20 04:05:57 +000074 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
75 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
76 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
Benjamin Petersoncd76c272008-04-05 15:09:30 +000077 # raw strings should not have unicode escapes
Florent Xiclunaa87b3832010-09-13 02:28:18 +000078 self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000079
Georg Brandl559e5d72008-06-11 18:37:52 +000080 def test_ascii(self):
81 if not sys.platform.startswith('java'):
82 # Test basic sanity of repr()
83 self.assertEqual(ascii('abc'), "'abc'")
84 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
85 self.assertEqual(ascii('ab\\'), "'ab\\\\'")
86 self.assertEqual(ascii('\\c'), "'\\\\c'")
87 self.assertEqual(ascii('\\'), "'\\\\'")
88 self.assertEqual(ascii('\n'), "'\\n'")
89 self.assertEqual(ascii('\r'), "'\\r'")
90 self.assertEqual(ascii('\t'), "'\\t'")
91 self.assertEqual(ascii('\b'), "'\\x08'")
92 self.assertEqual(ascii("'\""), """'\\'"'""")
93 self.assertEqual(ascii("'\""), """'\\'"'""")
94 self.assertEqual(ascii("'"), '''"'"''')
95 self.assertEqual(ascii('"'), """'"'""")
96 latin1repr = (
97 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
98 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
99 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
100 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
101 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
102 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
103 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
104 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
105 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
106 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
107 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
108 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
109 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
110 "\\xfe\\xff'")
111 testrepr = ascii(''.join(map(chr, range(256))))
112 self.assertEqual(testrepr, latin1repr)
113 # Test ascii works on wide unicode escapes without overflow.
114 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
115 ascii("\U00010000" * 39 + "\uffff" * 4096))
116
117 class WrongRepr:
118 def __repr__(self):
119 return b'byte-repr'
120 self.assertRaises(TypeError, ascii, WrongRepr())
121
Walter Dörwald28256f22003-01-19 16:59:20 +0000122 def test_repr(self):
123 if not sys.platform.startswith('java'):
124 # Test basic sanity of repr()
Walter Dörwald67e83882007-05-05 12:26:27 +0000125 self.assertEqual(repr('abc'), "'abc'")
126 self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
127 self.assertEqual(repr('ab\\'), "'ab\\\\'")
128 self.assertEqual(repr('\\c'), "'\\\\c'")
129 self.assertEqual(repr('\\'), "'\\\\'")
130 self.assertEqual(repr('\n'), "'\\n'")
131 self.assertEqual(repr('\r'), "'\\r'")
132 self.assertEqual(repr('\t'), "'\\t'")
133 self.assertEqual(repr('\b'), "'\\x08'")
134 self.assertEqual(repr("'\""), """'\\'"'""")
135 self.assertEqual(repr("'\""), """'\\'"'""")
136 self.assertEqual(repr("'"), '''"'"''')
137 self.assertEqual(repr('"'), """'"'""")
Walter Dörwald28256f22003-01-19 16:59:20 +0000138 latin1repr = (
Walter Dörwald67e83882007-05-05 12:26:27 +0000139 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
Walter Dörwald28256f22003-01-19 16:59:20 +0000140 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
141 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
142 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
143 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
144 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl559e5d72008-06-11 18:37:52 +0000145 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
146 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
147 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
148 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
149 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
150 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
151 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
152 "\xfe\xff'")
Guido van Rossum805365e2007-05-07 22:24:25 +0000153 testrepr = repr(''.join(map(chr, range(256))))
Walter Dörwald28256f22003-01-19 16:59:20 +0000154 self.assertEqual(testrepr, latin1repr)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000155 # Test repr works on wide unicode escapes without overflow.
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000156 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
157 repr("\U00010000" * 39 + "\uffff" * 4096))
Walter Dörwald28256f22003-01-19 16:59:20 +0000158
Georg Brandl559e5d72008-06-11 18:37:52 +0000159 class WrongRepr:
160 def __repr__(self):
161 return b'byte-repr'
162 self.assertRaises(TypeError, repr, WrongRepr())
163
Guido van Rossum49d6b072006-08-17 21:11:47 +0000164 def test_iterators(self):
165 # Make sure unicode objects have an __iter__ method
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000166 it = "\u1111\u2222\u3333".__iter__()
167 self.assertEqual(next(it), "\u1111")
168 self.assertEqual(next(it), "\u2222")
169 self.assertEqual(next(it), "\u3333")
Georg Brandla18af4e2007-04-21 15:47:16 +0000170 self.assertRaises(StopIteration, next, it)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000171
Walter Dörwald28256f22003-01-19 16:59:20 +0000172 def test_count(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000173 string_tests.CommonTest.test_count(self)
174 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000175 self.checkequalnofix(3, 'aaa', 'count', 'a')
176 self.checkequalnofix(0, 'aaa', 'count', 'b')
177 self.checkequalnofix(3, 'aaa', 'count', 'a')
178 self.checkequalnofix(0, 'aaa', 'count', 'b')
179 self.checkequalnofix(0, 'aaa', 'count', 'b')
180 self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
181 self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
182 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
183 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000184
Walter Dörwald28256f22003-01-19 16:59:20 +0000185 def test_find(self):
Antoine Pitrouc0bbe7d2011-10-08 22:41:35 +0200186 string_tests.CommonTest.test_find(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200187 # test implementation details of the memchr fast path
188 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
189 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
190 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
191 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
192 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
193 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
194 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
195 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000196 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
197 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
198 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000199
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000200 self.assertRaises(TypeError, 'hello'.find)
201 self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000202
Walter Dörwald28256f22003-01-19 16:59:20 +0000203 def test_rfind(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000204 string_tests.CommonTest.test_rfind(self)
Antoine Pitrou2c3b2302011-10-11 20:29:21 +0200205 # test implementation details of the memrchr fast path
206 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
207 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
208 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
209 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
210 self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
211 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
212 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000213 # check mixed argument types
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000214 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
215 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
216 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000217
Walter Dörwald28256f22003-01-19 16:59:20 +0000218 def test_index(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000219 string_tests.CommonTest.test_index(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000220 self.checkequalnofix(0, 'abcdefghiabc', 'index', '')
221 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def')
222 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc')
223 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1)
224 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
225 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1)
226 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8)
227 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000228
Walter Dörwald28256f22003-01-19 16:59:20 +0000229 def test_rindex(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000230 string_tests.CommonTest.test_rindex(self)
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000231 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '')
232 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def')
233 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc')
234 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000235
Walter Dörwaldaa97f042007-05-03 21:05:51 +0000236 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib')
237 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1)
238 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1)
239 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8)
240 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000241
Georg Brandlceee0772007-11-27 23:48:05 +0000242 def test_maketrans_translate(self):
243 # these work with plain translate()
244 self.checkequalnofix('bbbc', 'abababc', 'translate',
245 {ord('a'): None})
246 self.checkequalnofix('iiic', 'abababc', 'translate',
247 {ord('a'): None, ord('b'): ord('i')})
248 self.checkequalnofix('iiix', 'abababc', 'translate',
249 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
250 self.checkequalnofix('c', 'abababc', 'translate',
251 {ord('a'): None, ord('b'): ''})
252 self.checkequalnofix('xyyx', 'xzx', 'translate',
253 {ord('z'): 'yy'})
Victor Stinner5a29f252014-04-05 00:17:51 +0200254
Georg Brandlceee0772007-11-27 23:48:05 +0000255 # this needs maketrans()
256 self.checkequalnofix('abababc', 'abababc', 'translate',
257 {'b': '<i>'})
258 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
259 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
260 # test alternative way of calling maketrans()
261 tbl = self.type2test.maketrans('abc', 'xyz', 'd')
262 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
263
Victor Stinner5a29f252014-04-05 00:17:51 +0200264 # various tests switching from ASCII to latin1 or the opposite;
265 # same length, remove a letter, or replace with a longer string.
266 self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
267 "[X]")
268 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
269 "[X]")
270 self.assertEqual("[a]".translate(str.maketrans({'a': None})),
271 "[]")
272 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
273 "[XXX]")
274 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
275 "[\xe9]")
276 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
277 "[<\xe9>]")
278 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
279 "[a]")
280 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
281 "[]")
282
Victor Stinner4ff33af2014-04-05 11:56:37 +0200283 # invalid Unicode characters
284 invalid_char = 0x10ffff+1
285 for before in "a\xe9\u20ac\U0010ffff":
286 mapping = str.maketrans({before: invalid_char})
287 text = "[%s]" % before
288 self.assertRaises(ValueError, text.translate, mapping)
289
290 # errors
Georg Brandlceee0772007-11-27 23:48:05 +0000291 self.assertRaises(TypeError, self.type2test.maketrans)
292 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
293 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
294 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
295 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
296 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
297 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
Guido van Rossuma831cac2000-03-10 23:23:21 +0000298
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000299 self.assertRaises(TypeError, 'hello'.translate)
Walter Dörwald67e83882007-05-05 12:26:27 +0000300 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000301
Walter Dörwald28256f22003-01-19 16:59:20 +0000302 def test_split(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000303 string_tests.CommonTest.test_split(self)
Andrew M. Kuchlingeddd68d2002-03-29 16:21:44 +0000304
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000305 # Mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000306 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
307 self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
308 self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000309
Walter Dörwald28256f22003-01-19 16:59:20 +0000310 def test_join(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000311 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
Marc-André Lemburgd6d06ad2000-07-07 17:48:52 +0000312
Guido van Rossumf1044292007-09-27 18:01:22 +0000313 class MyWrapper:
314 def __init__(self, sval): self.sval = sval
315 def __str__(self): return self.sval
316
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000317 # mixed arguments
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000318 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
319 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
320 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
321 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
322 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
323 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
324 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
Guido van Rossum98297ee2007-11-06 21:34:58 +0000325 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
326 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
327 self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
328 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000329
Walter Dörwald28256f22003-01-19 16:59:20 +0000330 def test_replace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000331 string_tests.CommonTest.test_replace(self)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000332
Walter Dörwald28256f22003-01-19 16:59:20 +0000333 # method call forwarded from str implementation because of unicode argument
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000334 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
335 self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000336
Victor Stinner59de0ee2011-10-07 10:01:28 +0200337 @support.cpython_only
338 def test_replace_id(self):
Victor Stinner1d972ad2011-10-07 13:31:46 +0200339 pattern = 'abc'
340 text = 'abc def'
341 self.assertIs(text.replace(pattern, pattern), text)
Victor Stinner59de0ee2011-10-07 10:01:28 +0200342
Guido van Rossum98297ee2007-11-06 21:34:58 +0000343 def test_bytes_comparison(self):
Brett Cannon226b2302010-03-20 22:22:22 +0000344 with support.check_warnings():
345 warnings.simplefilter('ignore', BytesWarning)
346 self.assertEqual('abc' == b'abc', False)
347 self.assertEqual('abc' != b'abc', True)
348 self.assertEqual('abc' == bytearray(b'abc'), False)
349 self.assertEqual('abc' != bytearray(b'abc'), True)
Brett Cannon40430012007-10-22 20:24:51 +0000350
Walter Dörwald28256f22003-01-19 16:59:20 +0000351 def test_comparison(self):
352 # Comparisons:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000353 self.assertEqual('abc', 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000354 self.assertTrue('abcd' > 'abc')
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000355 self.assertTrue('abc' < 'abcd')
Walter Dörwald28256f22003-01-19 16:59:20 +0000356
357 if 0:
358 # Move these tests to a Unicode collation module test...
359 # Testing UTF-16 code point order comparisons...
360
361 # No surrogates, no fixup required.
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000362 self.assertTrue('\u0061' < '\u20ac')
Walter Dörwald28256f22003-01-19 16:59:20 +0000363 # Non surrogate below surrogate value, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000364 self.assertTrue('\u0061' < '\ud800\udc02')
Walter Dörwald28256f22003-01-19 16:59:20 +0000365
366 # Non surrogate above surrogate value, fixup required
367 def test_lecmp(s, s2):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000368 self.assertTrue(s < s2)
Walter Dörwald28256f22003-01-19 16:59:20 +0000369
370 def test_fixup(s):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000371 s2 = '\ud800\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000372 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000373 s2 = '\ud900\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000374 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000375 s2 = '\uda00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000376 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000377 s2 = '\udb00\udc01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000378 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000379 s2 = '\ud800\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000380 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000381 s2 = '\ud900\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000382 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000383 s2 = '\uda00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000384 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000385 s2 = '\udb00\udd01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000386 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000387 s2 = '\ud800\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000388 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000389 s2 = '\ud900\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000390 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000391 s2 = '\uda00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000392 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000393 s2 = '\udb00\ude01'
Walter Dörwald28256f22003-01-19 16:59:20 +0000394 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000395 s2 = '\ud800\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000396 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000397 s2 = '\ud900\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000398 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000399 s2 = '\uda00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000400 test_lecmp(s, s2)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000401 s2 = '\udb00\udfff'
Walter Dörwald28256f22003-01-19 16:59:20 +0000402 test_lecmp(s, s2)
403
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000404 test_fixup('\ue000')
405 test_fixup('\uff61')
Walter Dörwald28256f22003-01-19 16:59:20 +0000406
407 # Surrogates on both sides, no fixup required
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000408 self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000409
Walter Dörwald28256f22003-01-19 16:59:20 +0000410 def test_islower(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000411 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000412 self.checkequalnofix(False, '\u1FFc', 'islower')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500413 self.assertFalse('\u2167'.islower())
414 self.assertTrue('\u2177'.islower())
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300415 # non-BMP, uppercase
416 self.assertFalse('\U00010401'.islower())
417 self.assertFalse('\U00010427'.islower())
418 # non-BMP, lowercase
419 self.assertTrue('\U00010429'.islower())
420 self.assertTrue('\U0001044E'.islower())
421 # non-BMP, non-cased
422 self.assertFalse('\U0001F40D'.islower())
423 self.assertFalse('\U0001F46F'.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000424
425 def test_isupper(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000426 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
427 if not sys.platform.startswith('java'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000428 self.checkequalnofix(False, '\u1FFc', 'isupper')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500429 self.assertTrue('\u2167'.isupper())
430 self.assertFalse('\u2177'.isupper())
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300431 # non-BMP, uppercase
432 self.assertTrue('\U00010401'.isupper())
433 self.assertTrue('\U00010427'.isupper())
434 # non-BMP, lowercase
435 self.assertFalse('\U00010429'.isupper())
436 self.assertFalse('\U0001044E'.isupper())
437 # non-BMP, non-cased
438 self.assertFalse('\U0001F40D'.isupper())
439 self.assertFalse('\U0001F46F'.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000440
441 def test_istitle(self):
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300442 string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000443 self.checkequalnofix(True, '\u1FFc', 'istitle')
444 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
Walter Dörwald28256f22003-01-19 16:59:20 +0000445
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300446 # non-BMP, uppercase + lowercase
447 self.assertTrue('\U00010401\U00010429'.istitle())
448 self.assertTrue('\U00010427\U0001044E'.istitle())
449 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
450 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
451 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
452
Walter Dörwald28256f22003-01-19 16:59:20 +0000453 def test_isspace(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000454 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000455 self.checkequalnofix(True, '\u2000', 'isspace')
456 self.checkequalnofix(True, '\u200a', 'isspace')
457 self.checkequalnofix(False, '\u2014', 'isspace')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300458 # apparently there are no non-BMP spaces chars in Unicode 6
459 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
460 '\U0001F40D', '\U0001F46F']:
461 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
462
463 def test_isalnum(self):
464 string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
465 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
466 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
467 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000468
469 def test_isalpha(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000470 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000471 self.checkequalnofix(True, '\u1FFc', 'isalpha')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300472 # non-BMP, cased
473 self.assertTrue('\U00010401'.isalpha())
474 self.assertTrue('\U00010427'.isalpha())
475 self.assertTrue('\U00010429'.isalpha())
476 self.assertTrue('\U0001044E'.isalpha())
477 # non-BMP, non-cased
478 self.assertFalse('\U0001F40D'.isalpha())
479 self.assertFalse('\U0001F46F'.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000480
481 def test_isdecimal(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000482 self.checkequalnofix(False, '', 'isdecimal')
483 self.checkequalnofix(False, 'a', 'isdecimal')
484 self.checkequalnofix(True, '0', 'isdecimal')
485 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
486 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
487 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
488 self.checkequalnofix(True, '0123456789', 'isdecimal')
489 self.checkequalnofix(False, '0123456789a', 'isdecimal')
Walter Dörwald28256f22003-01-19 16:59:20 +0000490
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000491 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000492
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300493 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
494 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
495 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
496 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
497 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
498
Walter Dörwald28256f22003-01-19 16:59:20 +0000499 def test_isdigit(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000500 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000501 self.checkequalnofix(True, '\u2460', 'isdigit')
502 self.checkequalnofix(False, '\xbc', 'isdigit')
503 self.checkequalnofix(True, '\u0660', 'isdigit')
Walter Dörwald28256f22003-01-19 16:59:20 +0000504
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300505 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
506 '\U0001F40D', '\U0001F46F', '\U00011065']:
507 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
508 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
509 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
510
Walter Dörwald28256f22003-01-19 16:59:20 +0000511 def test_isnumeric(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000512 self.checkequalnofix(False, '', 'isnumeric')
513 self.checkequalnofix(False, 'a', 'isnumeric')
514 self.checkequalnofix(True, '0', 'isnumeric')
515 self.checkequalnofix(True, '\u2460', 'isnumeric')
516 self.checkequalnofix(True, '\xbc', 'isnumeric')
517 self.checkequalnofix(True, '\u0660', 'isnumeric')
518 self.checkequalnofix(True, '0123456789', 'isnumeric')
519 self.checkequalnofix(False, '0123456789a', 'isnumeric')
Walter Dörwald28256f22003-01-19 16:59:20 +0000520
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000521 self.assertRaises(TypeError, "abc".isnumeric, 42)
Walter Dörwald28256f22003-01-19 16:59:20 +0000522
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300523 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
524 '\U0001F40D', '\U0001F46F']:
525 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
526 for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
527 '\U000104A0', '\U0001F107']:
528 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
529
Martin v. Löwis47383402007-08-15 07:32:56 +0000530 def test_isidentifier(self):
531 self.assertTrue("a".isidentifier())
532 self.assertTrue("Z".isidentifier())
533 self.assertTrue("_".isidentifier())
534 self.assertTrue("b0".isidentifier())
535 self.assertTrue("bc".isidentifier())
536 self.assertTrue("b_".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000537 self.assertTrue("µ".isidentifier())
Benjamin Petersonf413b802011-08-12 22:17:18 -0500538 self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000539
540 self.assertFalse(" ".isidentifier())
541 self.assertFalse("[".isidentifier())
Antoine Pitroud72402e2010-10-27 18:52:48 +0000542 self.assertFalse("©".isidentifier())
Georg Brandld52429f2008-07-04 15:55:02 +0000543 self.assertFalse("0".isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000544
Georg Brandl559e5d72008-06-11 18:37:52 +0000545 def test_isprintable(self):
546 self.assertTrue("".isprintable())
Benjamin Peterson09832742009-03-26 17:15:46 +0000547 self.assertTrue(" ".isprintable())
Georg Brandl559e5d72008-06-11 18:37:52 +0000548 self.assertTrue("abcdefg".isprintable())
549 self.assertFalse("abcdefg\n".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000550 # some defined Unicode character
551 self.assertTrue("\u0374".isprintable())
552 # undefined character
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000553 self.assertFalse("\u0378".isprintable())
Georg Brandld52429f2008-07-04 15:55:02 +0000554 # single surrogate character
Georg Brandl559e5d72008-06-11 18:37:52 +0000555 self.assertFalse("\ud800".isprintable())
556
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300557 self.assertTrue('\U0001F46F'.isprintable())
558 self.assertFalse('\U000E0020'.isprintable())
559
560 def test_surrogates(self):
561 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
562 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
563 self.assertTrue(s.islower())
564 self.assertFalse(s.isupper())
565 self.assertFalse(s.istitle())
566 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
567 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
568 self.assertFalse(s.islower())
569 self.assertTrue(s.isupper())
570 self.assertTrue(s.istitle())
571
572 for meth_name in ('islower', 'isupper', 'istitle'):
573 meth = getattr(str, meth_name)
574 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
575 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
576
577 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
578 'isdecimal', 'isnumeric',
579 'isidentifier', 'isprintable'):
580 meth = getattr(str, meth_name)
581 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
582 'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
583 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
584 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
585
586
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300587 def test_lower(self):
588 string_tests.CommonTest.test_lower(self)
589 self.assertEqual('\U00010427'.lower(), '\U0001044F')
590 self.assertEqual('\U00010427\U00010427'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300591 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300592 self.assertEqual('\U00010427\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300593 '\U0001044F\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300594 self.assertEqual('X\U00010427x\U0001044F'.lower(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300595 'x\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500596 self.assertEqual('fi'.lower(), 'fi')
597 self.assertEqual('\u0130'.lower(), '\u0069\u0307')
598 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
599 self.assertEqual('\u03a3'.lower(), '\u03c3')
600 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
601 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
602 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
603 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
604 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
605 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
606 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
607 self.assertEqual('\u2177'.lower(), '\u2177')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300608
Benjamin Petersond5890c82012-01-14 13:23:30 -0500609 def test_casefold(self):
610 self.assertEqual('hello'.casefold(), 'hello')
611 self.assertEqual('hELlo'.casefold(), 'hello')
612 self.assertEqual('ß'.casefold(), 'ss')
613 self.assertEqual('fi'.casefold(), 'fi')
614 self.assertEqual('\u03a3'.casefold(), '\u03c3')
615 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
Benjamin Peterson4eda9372012-08-05 15:05:34 -0700616 self.assertEqual('\u00b5'.casefold(), '\u03bc')
Benjamin Petersond5890c82012-01-14 13:23:30 -0500617
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300618 def test_upper(self):
619 string_tests.CommonTest.test_upper(self)
620 self.assertEqual('\U0001044F'.upper(), '\U00010427')
621 self.assertEqual('\U0001044F\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300622 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300623 self.assertEqual('\U00010427\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300624 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300625 self.assertEqual('X\U00010427x\U0001044F'.upper(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300626 'X\U00010427X\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500627 self.assertEqual('fi'.upper(), 'FI')
628 self.assertEqual('\u0130'.upper(), '\u0130')
629 self.assertEqual('\u03a3'.upper(), '\u03a3')
630 self.assertEqual('ß'.upper(), 'SS')
631 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
632 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
633 self.assertEqual('\u2177'.upper(), '\u2167')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300634
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300635 def test_capitalize(self):
636 string_tests.CommonTest.test_capitalize(self)
637 self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
638 self.assertEqual('\U0001044F\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300639 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300640 self.assertEqual('\U00010427\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300641 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300642 self.assertEqual('\U0001044F\U00010427'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300643 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300644 self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300645 'X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500646 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
647 exp = '\u0399\u0308\u0300\u0069\u0307'
648 self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
649 self.assertEqual('finnish'.capitalize(), 'FInnish')
650 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300651
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300652 def test_title(self):
653 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
654 self.assertEqual('\U0001044F'.title(), '\U00010427')
655 self.assertEqual('\U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300656 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300657 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300658 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300659 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300660 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300661 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300662 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300663 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300664 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500665 self.assertEqual('fiNNISH'.title(), 'Finnish')
666 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
667 self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300668
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300669 def test_swapcase(self):
670 string_tests.CommonTest.test_swapcase(self)
671 self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
672 self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
673 self.assertEqual('\U0001044F\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300674 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300675 self.assertEqual('\U00010427\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300676 '\U0001044F\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300677 self.assertEqual('\U0001044F\U00010427'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300678 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300679 self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300680 'x\U0001044FX\U00010427')
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500681 self.assertEqual('fi'.swapcase(), 'FI')
682 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
683 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
684 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
685 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
686 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
687 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
688 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
689 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
690 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
691 self.assertEqual('\u03a3'.swapcase(), '\u03c3')
692 self.assertEqual('ß'.swapcase(), 'SS')
693 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300694
Ezio Melottif84e01d2013-07-08 17:48:29 +0200695 def test_center(self):
696 string_tests.CommonTest.test_center(self)
697 self.assertEqual('x'.center(2, '\U0010FFFF'),
698 'x\U0010FFFF')
699 self.assertEqual('x'.center(3, '\U0010FFFF'),
700 '\U0010FFFFx\U0010FFFF')
701 self.assertEqual('x'.center(4, '\U0010FFFF'),
702 '\U0010FFFFx\U0010FFFF\U0010FFFF')
703
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400704 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
Benjamin Peterson4d856892014-10-15 13:39:46 -0400705 @support.cpython_only
Benjamin Petersone1bd38c2014-10-15 11:47:36 -0400706 def test_case_operation_overflow(self):
707 # Issue #22643
708 self.assertRaises(OverflowError, ("ü"*(2**32//12 + 1)).upper)
709
Walter Dörwald28256f22003-01-19 16:59:20 +0000710 def test_contains(self):
711 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000712 self.assertIn('a', 'abdb')
713 self.assertIn('a', 'bdab')
714 self.assertIn('a', 'bdaba')
715 self.assertIn('a', 'bdba')
716 self.assertNotIn('a', 'bdb')
717 self.assertIn('a', 'bdba')
718 self.assertIn('a', ('a',1,None))
719 self.assertIn('a', (1,None,'a'))
720 self.assertIn('a', ('a',1,None))
721 self.assertIn('a', (1,None,'a'))
722 self.assertNotIn('a', ('x',1,'y'))
723 self.assertNotIn('a', ('x',1,None))
724 self.assertNotIn('abcd', 'abcxxxx')
725 self.assertIn('ab', 'abcd')
726 self.assertIn('ab', 'abc')
727 self.assertIn('ab', (1,None,'ab'))
728 self.assertIn('', 'abc')
729 self.assertIn('', '')
730 self.assertIn('', 'abc')
731 self.assertNotIn('\0', 'abc')
732 self.assertIn('\0', '\0abc')
733 self.assertIn('\0', 'abc\0')
734 self.assertIn('a', '\0abc')
735 self.assertIn('asdf', 'asdf')
736 self.assertNotIn('asdf', 'asd')
737 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000738
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000739 self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000740
Serhiy Storchaka31b1c8b2013-06-12 09:20:44 +0300741 def test_issue18183(self):
742 '\U00010000\U00100000'.lower()
743 '\U00010000\U00100000'.casefold()
744 '\U00010000\U00100000'.upper()
745 '\U00010000\U00100000'.capitalize()
746 '\U00010000\U00100000'.title()
747 '\U00010000\U00100000'.swapcase()
748 '\U00100000'.center(3, '\U00010000')
749 '\U00100000'.ljust(3, '\U00010000')
750 '\U00100000'.rjust(3, '\U00010000')
751
Eric Smith8c663262007-08-25 02:26:07 +0000752 def test_format(self):
753 self.assertEqual(''.format(), '')
754 self.assertEqual('a'.format(), 'a')
755 self.assertEqual('ab'.format(), 'ab')
756 self.assertEqual('a{{'.format(), 'a{')
757 self.assertEqual('a}}'.format(), 'a}')
758 self.assertEqual('{{b'.format(), '{b')
759 self.assertEqual('}}b'.format(), '}b')
760 self.assertEqual('a{{b'.format(), 'a{b')
761
762 # examples from the PEP:
763 import datetime
764 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
765 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
766 "My name is Fred")
767 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
768 "My name is Fred :-{}")
769
770 d = datetime.date(2007, 8, 18)
771 self.assertEqual("The year is {0.year}".format(d),
772 "The year is 2007")
773
Eric Smith8c663262007-08-25 02:26:07 +0000774 # classes we'll use for testing
775 class C:
776 def __init__(self, x=100):
777 self._x = x
778 def __format__(self, spec):
779 return spec
780
781 class D:
782 def __init__(self, x):
783 self.x = x
784 def __format__(self, spec):
785 return str(self.x)
786
787 # class with __str__, but no __format__
788 class E:
789 def __init__(self, x):
790 self.x = x
791 def __str__(self):
792 return 'E(' + self.x + ')'
793
794 # class with __repr__, but no __format__ or __str__
795 class F:
796 def __init__(self, x):
797 self.x = x
798 def __repr__(self):
799 return 'F(' + self.x + ')'
800
801 # class with __format__ that forwards to string, for some format_spec's
802 class G:
803 def __init__(self, x):
804 self.x = x
805 def __str__(self):
806 return "string is " + self.x
807 def __format__(self, format_spec):
808 if format_spec == 'd':
809 return 'G(' + self.x + ')'
810 return object.__format__(self, format_spec)
811
Eric Smith739e2ad2007-08-27 19:07:22 +0000812 class I(datetime.date):
813 def __format__(self, format_spec):
814 return self.strftime(format_spec)
815
Eric Smith185e30c2007-08-30 22:23:08 +0000816 class J(int):
817 def __format__(self, format_spec):
818 return int.__format__(self * 2, format_spec)
819
Eric Smith8c663262007-08-25 02:26:07 +0000820
821 self.assertEqual(''.format(), '')
822 self.assertEqual('abc'.format(), 'abc')
823 self.assertEqual('{0}'.format('abc'), 'abc')
824 self.assertEqual('{0:}'.format('abc'), 'abc')
825# self.assertEqual('{ 0 }'.format('abc'), 'abc')
826 self.assertEqual('X{0}'.format('abc'), 'Xabc')
827 self.assertEqual('{0}X'.format('abc'), 'abcX')
828 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
829 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
830 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
831 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
832 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
833 self.assertEqual('{0}'.format(-15), '-15')
834 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
835 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
836 self.assertEqual('{{'.format(), '{')
837 self.assertEqual('}}'.format(), '}')
838 self.assertEqual('{{}}'.format(), '{}')
839 self.assertEqual('{{x}}'.format(), '{x}')
840 self.assertEqual('{{{0}}}'.format(123), '{123}')
841 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
842 self.assertEqual('}}{{'.format(), '}{')
843 self.assertEqual('}}x{{'.format(), '}x{')
844
Eric Smith7ade6482007-08-26 22:27:13 +0000845 # weird field names
846 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
847 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +0000848 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +0000849
Eric Smith8c663262007-08-25 02:26:07 +0000850 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
851 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
852 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
853 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
854 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
855 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
856 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
857
Eric Smith8c663262007-08-25 02:26:07 +0000858 # strings
859 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
860 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
861 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
862 self.assertEqual('{0:.0s}'.format('abcdef'), '')
863 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
864 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
865 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
866 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
867 self.assertEqual('{0:x<0s}'.format('result'), 'result')
868 self.assertEqual('{0:x<5s}'.format('result'), 'result')
869 self.assertEqual('{0:x<6s}'.format('result'), 'result')
870 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
871 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
872 self.assertEqual('{0: <7s}'.format('result'), 'result ')
873 self.assertEqual('{0:<7s}'.format('result'), 'result ')
874 self.assertEqual('{0:>7s}'.format('result'), ' result')
875 self.assertEqual('{0:>8s}'.format('result'), ' result')
876 self.assertEqual('{0:^8s}'.format('result'), ' result ')
877 self.assertEqual('{0:^9s}'.format('result'), ' result ')
878 self.assertEqual('{0:^10s}'.format('result'), ' result ')
879 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
880 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
881 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
882
Eric V. Smith2ea97122014-04-14 11:55:10 -0400883 # issue 12546: use \x00 as a fill character
884 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
885 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
886 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
887 self.assertEqual('{0:^6s}'.format('foo'), ' foo ')
888
889 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
890 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
891 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
892 self.assertEqual('{0:<6}'.format(3), '3 ')
893
894 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
895 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
896 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
897 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
898
899 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
900 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
901 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
902 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ')
903
Eric Smith8c663262007-08-25 02:26:07 +0000904 # format specifiers for user defined type
905 self.assertEqual('{0:abc}'.format(C()), 'abc')
906
Georg Brandld52429f2008-07-04 15:55:02 +0000907 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +0000908 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
909 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
910 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
911 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
912 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
913 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
914 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000915 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +0000916 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
917 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +0000918 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000919 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +0000920 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +0000921 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
922 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +0000923 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +0000924
Eric Smith8c663262007-08-25 02:26:07 +0000925 # test fallback to object.__format__
926 self.assertEqual('{0}'.format({}), '{}')
927 self.assertEqual('{0}'.format([]), '[]')
928 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +0000929
Eric Smith8c663262007-08-25 02:26:07 +0000930 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +0000931 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
932
Andrew Svetlov2cd8ce42012-12-23 14:27:17 +0200933 self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
934 self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
935 self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
Eric Smithe4d63172010-09-13 20:48:43 +0000936
Eric Smith739e2ad2007-08-27 19:07:22 +0000937 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
938 month=8,
939 day=27)),
940 "date: 2007-08-27")
941
Eric Smith185e30c2007-08-30 22:23:08 +0000942 # test deriving from a builtin type and overriding __format__
943 self.assertEqual("{0}".format(J(10)), "20")
944
945
Eric Smith8c663262007-08-25 02:26:07 +0000946 # string format specifiers
947 self.assertEqual('{0:}'.format('a'), 'a')
948
949 # computed format specifiers
950 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
951 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
952 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
953 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
954 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
955
956 # test various errors
957 self.assertRaises(ValueError, '{'.format)
958 self.assertRaises(ValueError, '}'.format)
959 self.assertRaises(ValueError, 'a{'.format)
960 self.assertRaises(ValueError, 'a}'.format)
961 self.assertRaises(ValueError, '{a'.format)
962 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +0000963 self.assertRaises(IndexError, '{0}'.format)
964 self.assertRaises(IndexError, '{1}'.format, 'abc')
965 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +0000966 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +0000967 self.assertRaises(ValueError, "abc{0:{}".format)
968 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +0000969 self.assertRaises(IndexError, "{0.}".format)
970 self.assertRaises(ValueError, "{0.}".format, 0)
Benjamin Peterson4d944742013-05-17 18:22:31 -0500971 self.assertRaises(ValueError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +0000972 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +0000973 self.assertRaises(KeyError, "{0]}".format)
974 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +0000975 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +0000976 self.assertRaises(ValueError, "{0[0}".format, 0)
977 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
978 self.assertRaises(KeyError, "{c]}".format)
979 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
980 self.assertRaises(ValueError, "{0}}".format, 0)
981 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +0000982 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +0000983 self.assertRaises(ValueError, "{0!}".format, 0)
984 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +0000985 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +0000986 self.assertRaises(IndexError, "{:}".format)
987 self.assertRaises(IndexError, "{:s}".format)
988 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +0000989 big = "23098475029384702983476098230754973209482573"
990 self.assertRaises(ValueError, ("{" + big + "}").format)
991 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +0000992
Eric Smith41669ca2009-05-23 14:23:22 +0000993 # issue 6089
994 self.assertRaises(ValueError, "{0[0]x}".format, [None])
995 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
996
Eric Smith8c663262007-08-25 02:26:07 +0000997 # can't have a replacement on the field name portion
998 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
999
1000 # exceed maximum recursion depth
1001 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1002 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1003 0, 1, 2, 3, 4, 5, 6, 7)
1004
1005 # string format spec errors
1006 self.assertRaises(ValueError, "{0:-s}".format, '')
1007 self.assertRaises(ValueError, format, "", "-")
1008 self.assertRaises(ValueError, "{0:=s}".format, '')
1009
Eric Smithb1ebcc62008-07-15 13:02:41 +00001010 # Alternate formatting is not supported
1011 self.assertRaises(ValueError, format, '', '#')
1012 self.assertRaises(ValueError, format, '', '#20')
1013
Victor Stinnerece58de2012-04-23 23:36:38 +02001014 # Non-ASCII
1015 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1016 'ABC\u0410\u0411\u0412')
1017 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1018 'ABC')
1019 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1020 '')
1021
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001022 self.assertEqual("{[{}]}".format({"{}": 5}), "5")
Benjamin Peterson4d944742013-05-17 18:22:31 -05001023 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1024 self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1025 self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1026 self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1027 self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1028 self.assertRaises(ValueError, "{a{}b}".format, 42)
1029 self.assertRaises(ValueError, "{a{b}".format, 42)
1030 self.assertRaises(ValueError, "{[}".format, 42)
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001031
Benjamin Peterson0ee22bf2013-11-26 19:22:36 -06001032 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
Benjamin Petersond2b58a92013-05-17 17:34:30 -05001033
Eric Smith27bbca62010-11-04 17:06:58 +00001034 def test_format_map(self):
1035 self.assertEqual(''.format_map({}), '')
1036 self.assertEqual('a'.format_map({}), 'a')
1037 self.assertEqual('ab'.format_map({}), 'ab')
1038 self.assertEqual('a{{'.format_map({}), 'a{')
1039 self.assertEqual('a}}'.format_map({}), 'a}')
1040 self.assertEqual('{{b'.format_map({}), '{b')
1041 self.assertEqual('}}b'.format_map({}), '}b')
1042 self.assertEqual('a{{b'.format_map({}), 'a{b')
1043
1044 # using mappings
1045 class Mapping(dict):
1046 def __missing__(self, key):
1047 return key
1048 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1049 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1050
1051 class InternalMapping:
1052 def __init__(self):
1053 self.mapping = {'a': 'hello'}
1054 def __getitem__(self, key):
1055 return self.mapping[key]
1056 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1057
1058
Eric Smith27bbca62010-11-04 17:06:58 +00001059 class C:
1060 def __init__(self, x=100):
1061 self._x = x
1062 def __format__(self, spec):
1063 return spec
Eric Smith27bbca62010-11-04 17:06:58 +00001064 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1065
1066 # test various errors
Eric V. Smithedbb6ca2012-03-12 15:16:22 -07001067 self.assertRaises(TypeError, ''.format_map)
1068 self.assertRaises(TypeError, 'a'.format_map)
1069
1070 self.assertRaises(ValueError, '{'.format_map, {})
1071 self.assertRaises(ValueError, '}'.format_map, {})
1072 self.assertRaises(ValueError, 'a{'.format_map, {})
1073 self.assertRaises(ValueError, 'a}'.format_map, {})
1074 self.assertRaises(ValueError, '{a'.format_map, {})
1075 self.assertRaises(ValueError, '}a'.format_map, {})
Eric Smith27bbca62010-11-04 17:06:58 +00001076
Eric V. Smith12ebefc2011-07-18 14:03:41 -04001077 # issue #12579: can't supply positional params to format_map
1078 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1079 self.assertRaises(ValueError, '{}'.format_map, 'a')
1080 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1081
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001082 def test_format_huge_precision(self):
1083 format_string = ".{}f".format(sys.maxsize + 1)
1084 with self.assertRaises(ValueError):
1085 result = format(2.34, format_string)
1086
1087 def test_format_huge_width(self):
1088 format_string = "{}f".format(sys.maxsize + 1)
1089 with self.assertRaises(ValueError):
1090 result = format(2.34, format_string)
1091
1092 def test_format_huge_item_number(self):
1093 format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1094 with self.assertRaises(ValueError):
1095 result = format_string.format(2.34)
1096
Eric Smith8ec90442009-03-14 12:29:34 +00001097 def test_format_auto_numbering(self):
1098 class C:
1099 def __init__(self, x=100):
1100 self._x = x
1101 def __format__(self, spec):
1102 return spec
1103
1104 self.assertEqual('{}'.format(10), '10')
1105 self.assertEqual('{:5}'.format('s'), 's ')
1106 self.assertEqual('{!r}'.format('s'), "'s'")
1107 self.assertEqual('{._x}'.format(C(10)), '10')
1108 self.assertEqual('{[1]}'.format([1, 2]), '2')
1109 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1110 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1111
1112 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1113 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1114
1115 # can't mix and match numbering and auto-numbering
1116 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1117 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1118 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1119 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1120
1121 # can mix and match auto-numbering and named
1122 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1123 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1124 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1125 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1126
def test_formatting(self):
    """Check printf-style (``%``) formatting of str.

    The shared checks from string_tests.MixinStrUnicodeUserStringTest run
    first.  The remainder covers non-ASCII text, ``%c``, ``*`` width and
    precision, nan/inf (issue 3382), precision applied to non-ASCII
    strings (PEP 393) and integer conversions of int-like objects
    (issue 19995).
    """
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
    # %5.2f pads its value to five characters, so a four-character value
    # is preceded by two spaces (separator plus one padding space).
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3),
                     'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3),
                     'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5),
                     'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57),
                     'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57),
                     'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
        self.assertEqual("%r" % ("\u1234",), "'\u1234'")
        self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
    self.assertEqual("%(x)s, %(y)s" % {'x': "abc", 'y': "def"}, 'abc, def')
    self.assertEqual("%(x)s, %(\xfc)s" % {'x': "abc", '\xfc': "def"}, 'abc, def')

    self.assertEqual('%c' % 0x1234, '\u1234')
    self.assertEqual('%c' % 0x21483, '\U00021483')
    self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
    self.assertEqual('%c' % '\U00021483', '\U00021483')
    self.assertRaises(TypeError, "%c".__mod__, "aa")
    self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
    self.assertRaises(TypeError, "%i".__mod__, "aa")

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo': "abc"}, '...abc...')
    # unused mapping keys are ignored
    self.assertEqual('...%(foo)s...' % {'foo': "abc", 'def': 123}, '...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1, 2, 3, "abc"),
                     '...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1, 2, 3, "abc"),
                     '...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % "abc", '...abc...')
    # width and precision supplied through '*'
    self.assertEqual('%*s' % (5, 'abc',), '  abc')
    self.assertEqual('%*s' % (-5, 'abc',), 'abc  ')
    self.assertEqual('%*.*s' % (5, 2, 'abc',), '   ab')
    self.assertEqual('%*.*s' % (5, 3, 'abc',), '  abc')
    self.assertEqual('%i %*.*s' % (10, 5, 3, 'abc',), '10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
    self.assertEqual('%c' % 'a', 'a')

    class Wrapper:
        def __str__(self):
            return '\u1234'
    self.assertEqual('%s' % Wrapper(), '\u1234')

    # issue 3382
    NAN = float('nan')
    INF = float('inf')
    self.assertEqual('%f' % NAN, 'nan')
    self.assertEqual('%F' % NAN, 'NAN')
    self.assertEqual('%f' % INF, 'inf')
    self.assertEqual('%F' % INF, 'INF')

    # PEP 393
    self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
    self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')

    # issue 19995
    class PseudoInt:
        # int-like: provides both __int__ and __index__
        def __init__(self, value):
            self.value = int(value)
        def __int__(self):
            return self.value
        def __index__(self):
            return self.value

    class PseudoFloat:
        # float-like: provides __int__ only, no __index__
        def __init__(self, value):
            self.value = float(value)
        def __int__(self):
            return int(self.value)

    pi = PseudoFloat(3.1415)
    letter_m = PseudoInt(109)
    self.assertEqual('%x' % 42, '2a')
    self.assertEqual('%X' % 15, 'F')
    self.assertEqual('%o' % 9, '11')
    self.assertEqual('%c' % 109, 'm')
    self.assertEqual('%x' % letter_m, '6d')
    self.assertEqual('%X' % letter_m, '6D')
    self.assertEqual('%o' % letter_m, '155')
    self.assertEqual('%c' % letter_m, 'm')
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float',
                           operator.mod, '%x', 3.14)
    self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float',
                           operator.mod, '%X', 2.11)
    self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float',
                           operator.mod, '%o', 1.79)
    self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat',
                           operator.mod, '%x', pi)
    self.assertRaises(TypeError, operator.mod, '%c', pi)
Ethan Furmandf3ed242014-01-05 06:50:30 -08001213
Ethan Furmanfb137212013-08-31 10:18:55 -07001214 def test_formatting_with_enum(self):
1215 # issue18780
1216 import enum
1217 class Float(float, enum.Enum):
1218 PI = 3.1415926
1219 class Int(enum.IntEnum):
1220 IDES = 15
1221 class Str(str, enum.Enum):
1222 ABC = 'abc'
1223 # Testing Unicode formatting strings...
Ethan Furman13bdfa72013-08-31 12:48:51 -07001224 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1225 'Str.ABC, Str.ABC')
1226 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1227 (Str.ABC, Str.ABC,
1228 Int.IDES, Int.IDES, Int.IDES,
1229 Float.PI, Float.PI),
1230 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
Ethan Furmanfb137212013-08-31 10:18:55 -07001231
1232 # formatting jobs delegated from the string implementation:
Ethan Furman13bdfa72013-08-31 12:48:51 -07001233 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1234 '...Str.ABC...')
1235 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1236 '...Int.IDES...')
1237 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1238 '...15...')
1239 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1240 '...15...')
1241 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1242 '...15...')
1243 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1244 '...3.141593...')
Ethan Furmanfb137212013-08-31 10:18:55 -07001245
Mark Dickinsonfb90c092012-10-28 10:18:03 +00001246 def test_formatting_huge_precision(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001247 format_string = "%.{}f".format(sys.maxsize + 1)
1248 with self.assertRaises(ValueError):
1249 result = format_string % 2.34
1250
@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    """A ``%f`` precision of INT_MAX + 1 must raise ValueError."""
    # INT_MAX comes from the _testcapi helper module, imported locally.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    with self.assertRaises(ValueError):
        # Only the exception matters; the result is deliberately discarded.
        format_string % 2.34
1257
1258 def test_formatting_huge_width(self):
1259 format_string = "%{}f".format(sys.maxsize + 1)
1260 with self.assertRaises(ValueError):
1261 result = format_string % 2.34
1262
Ezio Melottiba42fd52011-04-26 06:09:45 +03001263 def test_startswith_endswith_errors(self):
1264 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +03001265 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +03001266 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +03001267 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +03001268 self.assertIn('str', exc)
1269 self.assertIn('tuple', exc)
1270
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    """``%.1f`` must render 1.0 as '1.0' while the decorator's locale
    setting is in effect."""
    # should not format with a comma, but always with C locale
    rendered = '%.1f' % 1.0
    self.assertEqual('1.0', rendered)
Georg Brandlda6b1072006-01-20 17:48:54 +00001275
Walter Dörwald28256f22003-01-19 16:59:20 +00001276 def test_constructor(self):
1277 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1278
1279 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001280 str('unicode remains unicode'),
1281 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001282 )
1283
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001284 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001285 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001286
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001287 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1288 subclass = UnicodeSubclass(text)
1289 self.assertEqual(str(subclass), text)
1290 self.assertEqual(len(subclass), len(text))
1291 if text == 'ascii':
1292 self.assertEqual(subclass.encode('ascii'), b'ascii')
1293 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001294
Walter Dörwald28256f22003-01-19 16:59:20 +00001295 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001296 str('strings are converted to unicode'),
1297 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001298 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001299
Walter Dörwald28256f22003-01-19 16:59:20 +00001300 class StringCompat:
1301 def __init__(self, x):
1302 self.x = x
1303 def __str__(self):
1304 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001305
Walter Dörwald28256f22003-01-19 16:59:20 +00001306 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001307 str(StringCompat('__str__ compatible objects are recognized')),
1308 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001309 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001310
Walter Dörwald28256f22003-01-19 16:59:20 +00001311 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001312
Walter Dörwald28256f22003-01-19 16:59:20 +00001313 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001315 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001316
Guido van Rossume2a383d2007-01-15 16:59:06 +00001317 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001319
Walter Dörwald28256f22003-01-19 16:59:20 +00001320 # unicode(obj, encoding, error) tests (this maps to
1321 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001322
Walter Dörwald28256f22003-01-19 16:59:20 +00001323 if not sys.platform.startswith('java'):
1324 self.assertRaises(
1325 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001326 str,
1327 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001328 'utf-8',
1329 'strict'
1330 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001331
Walter Dörwald28256f22003-01-19 16:59:20 +00001332 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001333 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001334 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001335 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001336
Walter Dörwald28256f22003-01-19 16:59:20 +00001337 if not sys.platform.startswith('java'):
1338 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001339 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001340 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001341 'utf-8',
1342 'strict'
1343 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001344 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001345 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001346
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001347 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001348
Chris Jerdonek5fae0e52012-11-20 17:45:51 -08001349 def test_constructor_keyword_args(self):
1350 """Pass various keyword argument combinations to the constructor."""
1351 # The object argument can be passed as a keyword.
1352 self.assertEqual(str(object='foo'), 'foo')
1353 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1354 # The errors argument without encoding triggers "decode" mode.
1355 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1356 self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1357
1358 def test_constructor_defaults(self):
1359 """Check the constructor argument defaults."""
1360 # The object argument defaults to '' or b''.
1361 self.assertEqual(str(), '')
1362 self.assertEqual(str(errors='strict'), '')
1363 utf8_cent = '¢'.encode('utf-8')
1364 # The encoding argument defaults to utf-8.
1365 self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1366 # The errors argument defaults to strict.
1367 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1368
Walter Dörwald28256f22003-01-19 16:59:20 +00001369 def test_codecs_utf7(self):
1370 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001371 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1372 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1373 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1374 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1375 ('+', b'+-'),
1376 ('+-', b'+--'),
1377 ('+?', b'+-?'),
1378 ('\?', b'+AFw?'),
1379 ('+?', b'+-?'),
1380 (r'\\?', b'+AFwAXA?'),
1381 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001382 (r'++--', b'+-+---'),
1383 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1384 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001385 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001386
Walter Dörwald28256f22003-01-19 16:59:20 +00001387 for (x, y) in utfTests:
1388 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001389
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001390 # Unpaired surrogates are passed through
1391 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1392 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1393 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1394 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1395 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1396 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1397 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1398 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001399
Antoine Pitrou5418ee02011-11-15 01:42:21 +01001400 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1401 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001402
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001403 # Issue #2242: crash on some Windows/MSVC versions
Antoine Pitrou244651a2009-05-04 18:56:13 +00001404 self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1405
1406 # Direct encoded characters
1407 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1408 # Optional direct characters
1409 set_o = '!"#$%&*;<=>@[]^_`{|}'
1410 for c in set_d:
1411 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1412 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1413 for c in set_o:
1414 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001415
Walter Dörwald28256f22003-01-19 16:59:20 +00001416 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001417 self.assertEqual(''.encode('utf-8'), b'')
1418 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001419 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1420 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001421 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1422 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001423 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1424 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001425 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001426 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1427 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1428 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1429 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1430 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1431 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001432 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1433 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1434 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1435 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1436 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1437 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1438 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1439 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1440 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1441 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001442 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001443
Walter Dörwald28256f22003-01-19 16:59:20 +00001444 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001445 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1446 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1447 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448
Walter Dörwald28256f22003-01-19 16:59:20 +00001449 # Other possible utf-8 test cases:
1450 # * strict decoding testing for all of the
1451 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452
Ezio Melotti57221d02010-07-01 07:32:02 +00001453 def test_utf8_decode_valid_sequences(self):
1454 sequences = [
1455 # single byte
1456 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1457 # 2 bytes
1458 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1459 # 3 bytes
1460 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1461 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1462 # 4 bytes
1463 (b'\xF0\x90\x80\x80', '\U00010000'),
1464 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1465 ]
1466 for seq, res in sequences:
1467 self.assertEqual(seq.decode('utf-8'), res)
1468
1469
1470 def test_utf8_decode_invalid_sequences(self):
1471 # continuation bytes in a sequence of 2, 3, or 4 bytes
1472 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001473 # start bytes of a 2-byte sequence equivalent to code points < 0x7F
Ezio Melotti57221d02010-07-01 07:32:02 +00001474 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001475 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
Ezio Melotti57221d02010-07-01 07:32:02 +00001476 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1477 invalid_start_bytes = (
1478 continuation_bytes + invalid_2B_seq_start_bytes +
1479 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1480 )
1481
1482 for byte in invalid_start_bytes:
1483 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1484
1485 for sb in invalid_2B_seq_start_bytes:
1486 for cb in continuation_bytes:
1487 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1488
1489 for sb in invalid_4B_seq_start_bytes:
1490 for cb1 in continuation_bytes[:3]:
1491 for cb3 in continuation_bytes[:3]:
1492 self.assertRaises(UnicodeDecodeError,
1493 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1494
1495 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1496 self.assertRaises(UnicodeDecodeError,
1497 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1498 self.assertRaises(UnicodeDecodeError,
1499 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1500 # surrogates
1501 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1502 self.assertRaises(UnicodeDecodeError,
1503 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1504 self.assertRaises(UnicodeDecodeError,
1505 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1506 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1507 self.assertRaises(UnicodeDecodeError,
1508 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1509 self.assertRaises(UnicodeDecodeError,
1510 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1511 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1512 self.assertRaises(UnicodeDecodeError,
1513 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1514 self.assertRaises(UnicodeDecodeError,
1515 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1516
1517 def test_issue8271(self):
1518 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1519 # only the start byte and the continuation byte(s) are now considered
1520 # invalid, instead of the number of bytes specified by the start byte.
1521 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1522 # table 3-8, Row 2) for more information about the algorithm used.
1523 FFFD = '\ufffd'
1524 sequences = [
1525 # invalid start bytes
1526 (b'\x80', FFFD), # continuation byte
1527 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1528 (b'\xc0', FFFD),
1529 (b'\xc0\xc0', FFFD*2),
1530 (b'\xc1', FFFD),
1531 (b'\xc1\xc0', FFFD*2),
1532 (b'\xc0\xc1', FFFD*2),
1533 # with start byte of a 2-byte sequence
1534 (b'\xc2', FFFD), # only the start byte
1535 (b'\xc2\xc2', FFFD*2), # 2 start bytes
Ezio Melottif7ed5d12012-11-04 23:21:38 +02001536 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
Ezio Melotti57221d02010-07-01 07:32:02 +00001537 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1538 # with start byte of a 3-byte sequence
1539 (b'\xe1', FFFD), # only the start byte
1540 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1541 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1542 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1543 (b'\xe1\x80', FFFD), # only 1 continuation byte
1544 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1545 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1546 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1547 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1548 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1549 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1550 # with start byte of a 4-byte sequence
1551 (b'\xf1', FFFD), # only the start byte
1552 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1553 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1554 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1555 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1556 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1557 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1558 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1559 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1560 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1561 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1562 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1563 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1564 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1565 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1566 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1567 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1568 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1569 # with invalid start byte of a 4-byte sequence (rfc2279)
1570 (b'\xf5', FFFD), # only the start byte
1571 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1572 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1573 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1574 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1575 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1576 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1577 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1578 # with invalid start byte of a 5-byte sequence (rfc2279)
1579 (b'\xf8', FFFD), # only the start byte
1580 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1581 (b'\xf8\x80', FFFD*2), # only one continuation byte
1582 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1583 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1584 # with invalid start byte of a 6-byte sequence (rfc2279)
1585 (b'\xfc', FFFD), # only the start byte
1586 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1587 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1588 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1589 # invalid start byte
1590 (b'\xfe', FFFD),
1591 (b'\xfe\x80\x80', FFFD*3),
1592 # other sequences
1593 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1594 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1595 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1596 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1597 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1598 ]
1599 for n, (seq, res) in enumerate(sequences):
1600 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1601 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1602 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1603 self.assertEqual(seq.decode('utf-8', 'ignore'),
1604 res.replace('\uFFFD', ''))
1605
def to_bytestring(self, seq):
    """Convert whitespace-separated hex byte values (e.g. 'E1 80 41')
    into a bytes object."""
    octets = bytearray()
    for token in seq.split():
        octets.append(int(token, 16))
    return bytes(octets)
1608
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check the three error-handling modes for an invalid UTF-8 sequence.

    Strict decoding of seq must raise a UnicodeDecodeError whose message
    contains err.  With 'replace' the result must be res; with 'ignore'
    it must be res with every U+FFFD removed.  Both lenient modes are
    also checked with seq embedded between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    embedded = b'aaaa' + seq + b'bbbb'
    without_replacements = res.replace('\ufffd', '')
    for handler, expected in (('replace', res),
                              ('ignore', without_replacements)):
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(embedded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')
1627
1628 def test_invalid_start_byte(self):
1629 """
1630 Test that an 'invalid start byte' error is raised when the first byte
1631 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1632 4-bytes sequence. The invalid start byte is replaced with a single
1633 U+FFFD when errors='replace'.
1634 E.g. <80> is a continuation byte and can appear only after a start byte.
1635 """
1636 FFFD = '\ufffd'
1637 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1638 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1639 'invalid start byte')
1640
1641 def test_unexpected_end_of_data(self):
1642 """
1643 Test that an 'unexpected end of data' error is raised when the string
1644 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1645 enough continuation bytes. The incomplete sequence is replaced with a
1646 single U+FFFD when errors='replace'.
1647 E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1648 sequence, but it's followed by only 2 valid continuation bytes and the
1649 last continuation bytes is missing.
1650 Note: the continuation bytes must be all valid, if one of them is
1651 invalid another error will be raised.
1652 """
1653 sequences = [
1654 'C2', 'DF',
1655 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1656 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1657 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1658 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1659 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1660 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1661 ]
1662 FFFD = '\ufffd'
1663 for seq in sequences:
1664 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd',
1665 'unexpected end of data')
1666
1667 def test_invalid_cb_for_2bytes_seq(self):
1668 """
1669 Test that an 'invalid continuation byte' error is raised when the
1670 continuation byte of a 2-bytes sequence is invalid. The start byte
1671 is replaced by a single U+FFFD and the second byte is handled
1672 separately when errors='replace'.
1673 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1674 sequence, but 41 is not a valid continuation byte because it's the
1675 ASCII letter 'A'.
1676 """
1677 FFFD = '\ufffd'
1678 FFFDx2 = FFFD * 2
1679 sequences = [
1680 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1681 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1682 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1683 ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1684 ]
1685 for seq, res in sequences:
1686 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1687 'invalid continuation byte')
1688
1689 def test_invalid_cb_for_3bytes_seq(self):
1690 """
1691 Test that an 'invalid continuation byte' error is raised when the
1692 continuation byte(s) of a 3-bytes sequence are invalid. When
1693 errors='replace', if the first continuation byte is valid, the first
1694 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1695 third byte is handled separately, otherwise only the start byte is
1696 replaced with a U+FFFD and the other continuation bytes are handled
1697 separately.
1698 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1699 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1700 because it's the ASCII letter 'A'.
1701 Note: when the start byte is E0 or ED, the valid ranges for the first
1702 continuation byte are limited to A0..BF and 80..9F respectively.
1703 Python 2 used to consider all the bytes in range 80..BF valid when the
1704 start byte was ED. This is fixed in Python 3.
1705 """
1706 FFFD = '\ufffd'
1707 FFFDx2 = FFFD * 2
1708 sequences = [
1709 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1710 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1711 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1712 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1713 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1714 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1715 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1716 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1717 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1718 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1719 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1720 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1721 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1722 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1723 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1724 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1725 ('ED 7F', FFFD+'\x7f'),
1726 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1727 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1728 ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1729 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1730 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1731 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1732 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1733 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1734 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1735 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1736 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1737 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1738 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1739 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1740 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1741 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1742 ]
1743 for seq, res in sequences:
1744 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1745 'invalid continuation byte')
1746
1747 def test_invalid_cb_for_4bytes_seq(self):
1748 """
1749 Test that an 'invalid continuation byte' error is raised when the
1750 continuation byte(s) of a 4-bytes sequence are invalid. When
1751 errors='replace',the start byte and all the following valid
1752 continuation bytes are replaced with a single U+FFFD, and all the bytes
1753 starting from the first invalid continuation bytes (included) are
1754 handled separately.
1755 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1756 sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1757 because it's the ASCII letter 'A'.
1758 Note: when the start byte is E0 or ED, the valid ranges for the first
1759 continuation byte are limited to A0..BF and 80..9F respectively.
1760 However, when the start byte is ED, Python 2 considers all the bytes
1761 in range 80..BF valid. This is fixed in Python 3.
1762 """
1763 FFFD = '\ufffd'
1764 FFFDx2 = FFFD * 2
1765 sequences = [
1766 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1767 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1768 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1769 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1770 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1771 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1772 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1773 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1774 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1775 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1776 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1777 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1778 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1779 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1780 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1781 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1782 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1783 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1784 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1785 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1786 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1787 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1788 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1789 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1790 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1791 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1792 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1793 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
1794 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
1795 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
1796 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
1797 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
1798 ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
1799 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
1800 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
1801 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
1802 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
1803 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
1804 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
1805 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
1806 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
1807 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
1808 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
1809 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
1810 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
1811 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
1812 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
1813 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
1814 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
1815 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
1816 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
1817 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
1818 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
1819 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
1820 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
1821 ]
1822 for seq, res in sequences:
1823 self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1824 'invalid continuation byte')
1825
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001826 def test_codecs_idna(self):
1827 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001828 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001829
def test_codecs_errors(self):
    """Check how encoding, decoding and numeric conversion react to input
    they cannot process, for each error handler."""
    text = 'Andr\202 x'    # contains U+0082, which ASCII cannot encode
    data = b'Andr\202 x'   # contains byte 0x82, which ASCII cannot decode

    # Error handling (encoding)
    for handler in ((), ('strict',)):
        self.assertRaises(UnicodeError, text.encode, 'ascii', *handler)
    ignored = text.encode('ascii', 'ignore')
    replaced = text.encode('ascii', 'replace')
    self.assertEqual(ignored, b"Andr x")
    self.assertEqual(replaced, b"Andr? x")
    # Keyword arguments must give the same result as positional ones.
    self.assertEqual(replaced, text.encode('ascii', errors='replace'))
    self.assertEqual(ignored,
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    for handler in ((), ('strict',)):
        self.assertRaises(UnicodeError, str, data, 'ascii', *handler)
    self.assertEqual(str(data, 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(data, 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    unknown_name = b"\\N{foo}xx"
    self.assertEqual(unknown_name.decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # Codecs whose encoder/decoder return something other than the
    # expected (object, length) result.
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    for broken_codec in ("test.unicode1", "test.unicode2"):
        self.assertRaises(TypeError, "hello".encode, broken_codec)

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    for convert in (float, complex):
        for surrogate in ("\ud800", "\udf00"):
            self.assertRaises(UnicodeError, convert, surrogate)
Guido van Rossum97064862000-04-10 13:52:48 +00001866
Walter Dörwald28256f22003-01-19 16:59:20 +00001867 def test_codecs(self):
1868 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00001869 self.assertEqual('hello'.encode('ascii'), b'hello')
1870 self.assertEqual('hello'.encode('utf-7'), b'hello')
1871 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001872 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00001873 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
1874 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
1875 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00001876
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001877 # Default encoding is utf-8
1878 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
1879
Walter Dörwald28256f22003-01-19 16:59:20 +00001880 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001881 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001882 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001883 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1884 'utf-16-be', 'raw_unicode_escape',
1885 'unicode_escape', 'unicode_internal'):
Victor Stinner040e16e2011-11-15 22:44:05 +01001886 with warnings.catch_warnings():
1887 # unicode-internal has been deprecated
1888 warnings.simplefilter("ignore", DeprecationWarning)
1889
1890 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001891
Walter Dörwald28256f22003-01-19 16:59:20 +00001892 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001893 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001894 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001895 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001896 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001897
Walter Dörwald28256f22003-01-19 16:59:20 +00001898 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001899 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001900 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001901 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001902 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001903
Walter Dörwald28256f22003-01-19 16:59:20 +00001904 # Roundtrip safety for non-BMP (just a few chars)
Victor Stinner040e16e2011-11-15 22:44:05 +01001905 with warnings.catch_warnings():
1906 # unicode-internal has been deprecated
1907 warnings.simplefilter("ignore", DeprecationWarning)
1908
1909 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
1910 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1911 'raw_unicode_escape',
1912 'unicode_escape', 'unicode_internal'):
1913 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001914
Antoine Pitrou51f66482011-11-11 13:35:44 +01001915 # UTF-8 must be roundtrip safe for all code points
1916 # (except surrogates, which are forbidden).
1917 u = ''.join(map(chr, list(range(0, 0xd800)) +
Ezio Melotti40dc9192011-11-11 17:00:46 +02001918 list(range(0xe000, 0x110000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00001919 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001920 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001921
Walter Dörwald28256f22003-01-19 16:59:20 +00001922 def test_codecs_charmap(self):
1923 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00001924 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00001925 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05001926 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001927 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1928 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02001929 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00001930 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1931 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1932 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1933 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001934
Walter Dörwald28256f22003-01-19 16:59:20 +00001935 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1936 'cp1256', 'cp1257', 'cp1258',
1937 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001938
Walter Dörwald28256f22003-01-19 16:59:20 +00001939 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1940 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001941
Walter Dörwald28256f22003-01-19 16:59:20 +00001942 ### These have undefined mappings:
1943 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001944
Walter Dörwald28256f22003-01-19 16:59:20 +00001945 ### These fail the round-trip:
1946 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00001947
Walter Dörwald28256f22003-01-19 16:59:20 +00001948 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001949 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001950
Walter Dörwald28256f22003-01-19 16:59:20 +00001951 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00001952 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00001953 for encoding in (
Andrew Kuchlingad8156e2013-11-10 13:44:30 -05001954 'cp037', 'cp1026', 'cp273',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001955 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1956 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02001957 'cp863', 'cp865', 'cp866', 'cp1125',
Walter Dörwald28256f22003-01-19 16:59:20 +00001958 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1959 'iso8859_2', 'iso8859_4', 'iso8859_5',
1960 'iso8859_9', 'koi8_r', 'latin_1',
1961 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00001962
Walter Dörwald28256f22003-01-19 16:59:20 +00001963 ### These have undefined mappings:
1964 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1965 #'cp1256', 'cp1257', 'cp1258',
1966 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1967 #'iso8859_3', 'iso8859_6', 'iso8859_7',
1968 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00001969
Walter Dörwald28256f22003-01-19 16:59:20 +00001970 ### These fail the round-trip:
1971 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00001972
Walter Dörwald28256f22003-01-19 16:59:20 +00001973 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001974 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001975
Walter Dörwald28256f22003-01-19 16:59:20 +00001976 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001977 self.assertEqual(("abc" "def"), "abcdef")
1978 self.assertEqual(("abc" "def"), "abcdef")
1979 self.assertEqual(("abc" "def"), "abcdef")
1980 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1981 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001982
Walter Dörwald28256f22003-01-19 16:59:20 +00001983 def test_printing(self):
1984 class BitBucket:
1985 def write(self, text):
1986 pass
Fred Drake004d5e62000-10-23 17:22:08 +00001987
Walter Dörwald28256f22003-01-19 16:59:20 +00001988 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001989 print('abc', file=out)
1990 print('abc', 'def', file=out)
1991 print('abc', 'def', file=out)
1992 print('abc', 'def', file=out)
1993 print('abc\n', file=out)
1994 print('abc\n', end=' ', file=out)
1995 print('abc\n', end=' ', file=out)
1996 print('def\n', file=out)
1997 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00001998
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001999 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002000 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002001 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2002 self.assertEqual(x, y)
2003
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002004 y = br'\U00100000'
2005 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2006 self.assertEqual(x, y)
2007 y = br'\U00010000'
2008 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2009 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00002010
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002011 try:
2012 br'\U11111111'.decode("raw-unicode-escape")
2013 except UnicodeDecodeError as e:
2014 self.assertEqual(e.start, 0)
2015 self.assertEqual(e.end, 10)
2016 else:
2017 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00002018
Brett Cannonc3647ac2005-04-26 03:45:26 +00002019 def test_conversion(self):
2020 # Make sure __unicode__() works properly
2021 class Foo0:
2022 def __str__(self):
2023 return "foo"
2024
2025 class Foo1:
Guido van Rossum98297ee2007-11-06 21:34:58 +00002026 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002027 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002028
2029 class Foo2(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002030 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002031 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002032
2033 class Foo3(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002034 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002035 return "foo"
2036
2037 class Foo4(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002038 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002039 return "foo"
2040
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002041 class Foo5(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00002042 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002043 return "foo"
2044
2045 class Foo6(str):
2046 def __str__(self):
2047 return "foos"
2048
Guido van Rossum98297ee2007-11-06 21:34:58 +00002049 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002050 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002051
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002052 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002053 def __str__(self):
2054 return "foos"
Guido van Rossum98297ee2007-11-06 21:34:58 +00002055 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002056 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00002057
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002058 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002059 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002060 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002061 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002062 return self
2063
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002064 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002065 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00002066 return "not unicode"
2067
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002068 self.assertEqual(str(Foo0()), "foo")
2069 self.assertEqual(str(Foo1()), "foo")
2070 self.assertEqual(str(Foo2()), "foo")
2071 self.assertEqual(str(Foo3()), "foo")
2072 self.assertEqual(str(Foo4("bar")), "foo")
2073 self.assertEqual(str(Foo5("bar")), "foo")
2074 self.assertEqual(str(Foo6("bar")), "foou")
2075 self.assertEqual(str(Foo7("bar")), "foou")
2076 self.assertEqual(str(Foo8("foo")), "foofoo")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002077 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +00002078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002079 def test_unicode_repr(self):
2080 class s1:
2081 def __repr__(self):
2082 return '\\n'
2083
2084 class s2:
2085 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002086 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002087
2088 self.assertEqual(repr(s1()), '\\n')
2089 self.assertEqual(repr(s2()), '\\n')
2090
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002091 def test_printable_repr(self):
2092 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00002093 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002094
Zachary Ware9fe6d862013-12-08 00:20:35 -06002095 # This test only affects 32-bit platforms because expandtabs can only take
2096 # an int as the max value, not a 64-bit C long. If expandtabs is changed
2097 # to take a 64-bit long, this test should apply to all platforms.
2098 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2099 'only applies to 32-bit platforms')
Guido van Rossumcd16bf62007-06-13 18:07:49 +00002100 def test_expandtabs_overflows_gracefully(self):
Christian Heimesa37d4c62007-12-04 23:02:19 +00002101 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002102
Victor Stinner1d972ad2011-10-07 13:31:46 +02002103 @support.cpython_only
Antoine Pitroue19aa382011-10-04 16:04:01 +02002104 def test_expandtabs_optimization(self):
2105 s = 'abc'
2106 self.assertIs(s.expandtabs(), s)
2107
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00002108 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 if struct.calcsize('P') == 8:
2110 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002111 ascii_struct_size = 48
2112 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 else:
2114 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02002115 ascii_struct_size = 24
2116 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117
2118 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2119 code = ord(char)
2120 if code < 0x100:
2121 char_size = 1 # sizeof(Py_UCS1)
2122 struct_size = ascii_struct_size
2123 elif code < 0x10000:
2124 char_size = 2 # sizeof(Py_UCS2)
2125 struct_size = compact_struct_size
2126 else:
2127 char_size = 4 # sizeof(Py_UCS4)
2128 struct_size = compact_struct_size
2129 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02002130 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2131 # be allocatable, given enough memory.
2132 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 alloc = lambda: char * maxlen
2134 self.assertRaises(MemoryError, alloc)
2135 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00002136
Victor Stinner808fc0a2010-03-22 12:50:40 +00002137 def test_format_subclass(self):
2138 class S(str):
2139 def __str__(self):
2140 return '__str__ overridden'
2141 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00002142 self.assertEqual("%s" % s, '__str__ overridden')
2143 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00002144
# Test PyUnicode_FromFormat()
def test_from_format(self):
    """Call the C function PyUnicode_FromFormat() through ctypes and
    compare its result with the expected text for each format code."""
    support.import_module('ctypes')
    from ctypes import (
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments are wrapped in py_object; every other argument is
        # handed to ctypes unchanged.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error.  The pattern is a raw string so that "\(" reaches
    # the regex engine instead of being an invalid string escape.
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd', b'%c', c_int(0xabcd))
    check_format('\U0010ffff', b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%', b'%')
    check_format('%', b'%%')
    check_format('%s', b'%%s')
    check_format('[%]', b'[%%]')
    check_format('%abc', b'%%%s', b'abc')

    # truncated string
    check_format('abc', b'%.3s', b'abcdef')
    check_format('abc[\ufffd', b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'", b'%A', '\u20acABC')
    check_format("'\\u20", b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'", b'%R', '\u20acABC')
    check_format("'\u20acA", b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd', b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # The following tests come from #7330.  A width pads the converted
    # text on the left with spaces; the expected values spell that out
    # with rjust(), like the integer padding tests below, so the number
    # of spaces always matches the width in the format.

    # test width modifier and precision modifier with %S
    check_format("repr=" + "abc".rjust(5), b'repr=%5S', 'abc')
    check_format("repr=ab", b'repr=%.2S', 'abc')
    check_format("repr=" + "ab".rjust(5), b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    check_format("repr=" + "'abc'".rjust(8), b'repr=%8R', 'abc')
    check_format("repr='ab", b'repr=%.3R', 'abc')
    check_format("repr=" + "'ab".rjust(5), b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format("repr=" + "'abc'".rjust(8), b'repr=%8A', 'abc')
    check_format("repr='ab", b'repr=%.3A', 'abc')
    check_format("repr=" + "'ab".rjust(5), b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format("repr=" + "abc".rjust(5), b'repr=%5s', b'abc')
    check_format("repr=ab", b'repr=%.2s', b'abc')
    check_format("repr=" + "ab".rjust(5), b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format("repr=" + "abc".rjust(5), b'repr=%5U', 'abc')
    check_format("repr=ab", b'repr=%.2U', 'abc')
    check_format("repr=" + "ab".rjust(5), b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format("repr=" + "abc".rjust(5), b'repr=%5V', 'abc', b'123')
    check_format("repr=ab", b'repr=%.2V', 'abc', b'123')
    check_format("repr=" + "ab".rjust(5), b'repr=%5.2V', 'abc', b'123')
    check_format("repr=" + "123".rjust(5), b'repr=%5V', None, b'123')
    check_format("repr=12", b'repr=%.2V', None, b'123')
    check_format("repr=" + "12".rjust(5), b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010', b'%03i', c_int(10))
    check_format('0010', b'%0.4i', c_int(10))
    signed_formats = (
        (b'%i', c_int), (b'%li', c_long),
        (b'%lli', c_longlong), (b'%zi', c_ssize_t),
        (b'%d', c_int), (b'%ld', c_long),
        (b'%lld', c_longlong), (b'%zd', c_ssize_t),
    )
    for format, ctype in signed_formats:
        check_format('-123', format, ctype(-123))
    unsigned_formats = (
        (b'%u', c_uint), (b'%lu', c_ulong),
        (b'%llu', c_ulonglong), (b'%zu', c_size_t),
    )
    for format, ctype in unsigned_formats:
        check_format('123', format, ctype(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong), b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong), b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong), b'%llu', c_ulonglong(max_ulonglong))
    # Only called, not compared with an expected value.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    padded = (
        (b'i', c_int(123)),
        (b'u', c_uint(123)),
        (b'x', c_int(0x123)),
    )
    for conversion, value in padded:
        check_format('123'.rjust(10, '0'), b'%010' + conversion, value)
        check_format('123'.rjust(100), b'%100' + conversion, value)
        check_format('123'.rjust(100, '0'), b'%.100' + conversion, value)
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80' + conversion, value)

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc', b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd', b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s', b'%1%s', b'abc')
    check_format('%1abc', b'%1abc')
    check_format('%+i', b'%+i', c_int(10))
    check_format('%.%s', b'%.%s', b'abc')
Victor Stinner6d970f42011-03-02 00:04:25 +00002377
Victor Stinner1c24bd02010-10-02 11:03:13 +00002378 # Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Exercise PyUnicode_AsWideChar() through its _testcapi wrapper.

    The wrapper is called as ``unicode_aswidechar(text, buflen)`` and
    returns a ``(buffer, size)`` pair; both items are checked for a range
    of buffer lengths, including ones shorter than, equal to and longer
    than the text.
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # Each row: (text, buffer length, expected size, expected buffer).
    cases = [
        # Buffer shorter than the text: output is truncated.
        ('abcdef', 2, 2, 'ab'),
        # Buffer exactly as long as the text: no trailing NUL expected.
        ('abc', 3, 3, 'abc'),
        # Buffer longer than the text: a single trailing NUL is expected.
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        # An embedded NUL does not end the conversion.
        ('abc\0def', 20, 7, 'abc\0def\0'),
    ]
    for text, buflen, expected_size, expected_buffer in cases:
        buffer, length = unicode_aswidechar(text, buflen)
        self.assertEqual(length, expected_size)
        self.assertEqual(buffer, expected_buffer)

    # The expected unit count for a non-BMP character depends on the
    # platform's wchar_t width: 2 units for a 2-byte wchar_t, else 1.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        buflen, nchar = 3, 2
    else:
        # sizeof(c_wchar) == 4
        buflen, nchar = 2, 1
    buffer, length = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(length, nchar)
    self.assertEqual(buffer, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002415
Victor Stinner1c24bd02010-10-02 11:03:13 +00002416 # Test PyUnicode_AsWideCharString()
@support.cpython_only
def test_aswidecharstring(self):
    """Exercise PyUnicode_AsWideCharString() through its _testcapi wrapper.

    The wrapper is called as ``unicode_aswidecharstring(text)`` and
    returns a ``(buffer, size)`` pair. In every case the buffer is
    expected to be the text followed by one NUL character.
    """
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # The expected unit count for a non-BMP character depends on the
    # platform's wchar_t width: 2 units for a 2-byte wchar_t, else 1.
    nonbmp = chr(0x10ffff)
    nonbmp_units = 2 if sizeof(c_wchar) == 2 else 1

    # Each row: (text, expected size).
    cases = [
        ('abc', 3),
        ('abc\0def', 7),  # an embedded NUL is counted like any character
        (nonbmp, nonbmp_units),
    ]
    for text, expected_size in cases:
        buffer, length = unicode_aswidecharstring(text)
        self.assertEqual(length, expected_size)
        self.assertEqual(buffer, text + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00002439
Benjamin Peterson811c2f12011-09-30 21:31:21 -04002440 def test_subclass_add(self):
2441 class S(str):
2442 def __add__(self, o):
2443 return "3"
2444 self.assertEqual(S("4") + S("5"), "3")
2445 class S(str):
2446 def __iadd__(self, o):
2447 return "3"
2448 s = S("1")
2449 s += "4"
2450 self.assertEqual(s, "3")
2451
@support.cpython_only
def test_encode_decimal(self):
    """Check the _testcapi ``unicode_encodedecimal`` helper.

    Successful conversions are compared against the expected bytes; a
    character that cannot be encoded must raise UnicodeEncodeError with
    the "strict" handler and ValueError with the "replace" handler.
    """
    from _testcapi import unicode_encodedecimal as encode_decimal

    # Each row: (input text, expected encoded bytes).
    conversions = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, expected in conversions:
        self.assertEqual(encode_decimal(text), expected)

    with self.assertRaises(UnicodeEncodeError):
        encode_decimal("123\u20ac", "strict")

    with self.assertRaisesRegex(
            ValueError, "^'decimal' codec can't encode character"):
        encode_decimal("123\u20ac", "replace")
Victor Stinner42bf7752011-11-21 22:52:58 +01002467
@support.cpython_only
def test_transform_decimal(self):
    """Check the _testcapi ``unicode_transformdecimaltoascii`` helper.

    Per the expectations below, decimal digits are turned into ASCII
    digits while other characters (spaces, the euro sign) are returned
    unchanged.
    """
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal

    # Each row: (input text, expected transformed text).
    conversions = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, expected in conversions:
        self.assertEqual(transform_decimal(text), expected)
2479
Victor Stinnerc814a382011-11-22 01:06:15 +01002480 def test_getnewargs(self):
2481 text = 'abc'
2482 args = text.__getnewargs__()
2483 self.assertIsNot(args[0], text)
2484 self.assertEqual(args[0], text)
2485 self.assertEqual(len(args), 1)
2486
def test_resize(self):
    """Check 'unicode_internal' round-trips before and after ``+=``.

    For several lengths, the string is encoded and decoded with the
    deprecated 'unicode_internal' codec, extended in place with ``+=``,
    and then encoded and decoded again.
    """
    expected_warning = ('unicode_internal codec has been deprecated',
                        DeprecationWarning)
    for length in range(1, 100, 7):
        # generate a fresh string (refcount=1); keep it bound to a single
        # name so that property is not disturbed
        text = 'a' * length + 'b'

        with support.check_warnings(expected_warning):
            # fill wstr internal field
            before = text.encode('unicode_internal')
            self.assertEqual(before.decode('unicode_internal'), text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            after = text.encode('unicode_internal')
            self.assertNotEqual(before, after)
            self.assertEqual(after.decode('unicode_internal'), text)
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01002503
def test_compare(self):
    """Check ==, !=, <, <=, >= between strings of different character ranges.

    Two strings are built for each of four ranges: ASCII, Latin-1
    (U+0080..U+00FF), BMP (U+0100..U+FFFF) and astral (above U+FFFF).
    Every pair, including each string paired with itself, is checked for
    equality; then a fixed set of ordering relations is asserted.
    """
    # Issue #17615
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also yields (s, s) pairs. Plain
    # combinations() never does, which would leave the "equal" branch
    # below unreachable.
    pairs = itertools.combinations_with_replacement(strings, 2)
    for text1, text2 in pairs:
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # Compare the two distinct copies, not copy2 with itself.
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2577
Victor Stinner1c24bd02010-10-02 11:03:13 +00002578
class StringModuleTest(unittest.TestCase):
    """Tests for the helper functions exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        """Check the tuples produced by ``_string.formatter_parser``.

        Each parsed item is a 4-tuple; per the expectations below it is
        (literal text, field name, format spec, conversion), with None
        for the last three when a literal has no replacement field.
        """
        # Each row: (format string, expected list of parsed tuples).
        expectations = [
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        ]
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        """Check the pieces from ``_string.formatter_field_name_split``.

        The result is compared as ``[first, rest]`` where ``rest`` is a
        list of ``(is_attribute, name)`` pairs: True for ``.name``
        access and False for ``[name]`` access.
        """
        def split(name):
            # Materialise the second item so it can be compared to a list.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        # Each row: (field name, expected [first, rest] result).
        expectations = [
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        ]
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2628
2629
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()