blob: 14d3fa6269a5d9b2141e96cb662a0cccb4d1ad4c [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Guido van Rossum98297ee2007-11-06 21:34:58 +00008import codecs
9import struct
10import sys
11import unittest
12import warnings
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support, string_tests
Eric Smitha1eac722011-01-29 11:15:35 +000014import _string
Guido van Rossuma831cac2000-03-10 23:23:21 +000015
Neal Norwitz430f68b2005-11-24 22:00:56 +000016# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that hands out deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    int instead of a tuple; "test.unicode2" maps to a pair that returns a
    tuple whose first item is not a string.  Every other name gets None.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
33
Brett Cannon226b2302010-03-20 22:22:22 +000034class UnicodeTest(string_tests.CommonTest,
35 string_tests.MixinStrUnicodeUserStringTest,
36 string_tests.MixinStrUnicodeTest):
37
Guido van Rossumef87d6e2007-05-02 19:09:54 +000038 type2test = str
Walter Dörwald0fd583c2003-02-21 12:53:50 +000039
def checkequalnofix(self, result, object, methodname, *args):
    """Call ``object.methodname(*args)`` and compare it with *result*.

    The call's return value must be equal to *result* and of exactly the
    same type.  If the method handed back *object* itself, the call is
    repeated on a ``str`` subclass instance, which must produce an equal
    value that is a different object from that instance.
    """
    realresult = getattr(object, methodname)(*args)
    self.assertEqual(realresult, result)
    # assertIs/assertIsNot report both operands on failure, unlike
    # assertTrue(a is b), which only says "False is not true".
    self.assertIs(type(realresult), type(result))

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    if realresult is object:
        class usub(str):
            def __repr__(self):
                return 'usub(%r)' % str.__repr__(self)
        wrapped = usub(object)
        subresult = getattr(wrapped, methodname)(*args)
        self.assertEqual(subresult, result)
        self.assertIsNot(wrapped, subresult)
Guido van Rossume4874ae2001-09-21 15:36:41 +000057
def test_literals(self):
    """Check escape handling in string literals."""
    self.assertEqual('\xff', '\u00ff')
    self.assertEqual('\uffff', '\U0000ffff')
    # Each of these \U escapes must make eval() raise SyntaxError.
    rejected = (
        '\'\\Ufffffffe\'',
        '\'\\Uffffffff\'',
        '\'\\U%08x\'' % 0x110000,
    )
    for literal in rejected:
        self.assertRaises(SyntaxError, eval, literal)
    # raw strings should not have unicode escapes
    self.assertNotEqual(r"\u0020", " ")
Jeremy Hylton504de6b2003-10-06 05:08:26 +000066
def test_ascii(self):
    """Check ascii() output for strings and its handling of a bad __repr__.

    The whole test is skipped when sys.platform starts with 'java'.
    """
    # NOTE(review): indentation was lost in the reviewed source; the
    # platform guard is assumed to cover the entire body -- confirm.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of ascii()
    expectations = (
        ('abc', "'abc'"),
        ('ab\\c', "'ab\\\\c'"),
        ('ab\\', "'ab\\\\'"),
        ('\\c', "'\\\\c'"),
        ('\\', "'\\\\'"),
        ('\n', "'\\n'"),
        ('\r', "'\\r'"),
        ('\t', "'\\t'"),
        ('\b', "'\\x08'"),
        ("'\"", """'\\'"'"""),
        ("'", '''"'"'''),
        ('"', """'"'"""),
    )
    for value, expected in expectations:
        self.assertEqual(ascii(value), expected)

    latin1repr = (
        "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = ascii(''.join(map(chr, range(256))))
    self.assertEqual(testrepr, latin1repr)

    # Test ascii works on wide unicode escapes without overflow.
    # Both sides are computed the same way, so this only verifies that
    # the calls complete and agree with each other.
    self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                     ascii("\U00010000" * 39 + "\uffff" * 4096))

    class WrongRepr:
        def __repr__(self):
            return b'byte-repr'
    # A __repr__ that returns bytes must make ascii() raise TypeError.
    self.assertRaises(TypeError, ascii, WrongRepr())
108
def test_repr(self):
    """Check repr() output for strings and its handling of a bad __repr__.

    The whole test is skipped when sys.platform starts with 'java'.
    """
    # NOTE(review): indentation was lost in the reviewed source; the
    # platform guard is assumed to cover the entire body -- confirm.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    expectations = (
        ('abc', "'abc'"),
        ('ab\\c', "'ab\\\\c'"),
        ('ab\\', "'ab\\\\'"),
        ('\\c', "'\\\\c'"),
        ('\\', "'\\\\'"),
        ('\n', "'\\n'"),
        ('\r', "'\\r'"),
        ('\t', "'\\t'"),
        ('\b', "'\\x08'"),
        ("'\"", """'\\'"'"""),
        ("'", '''"'"'''),
        ('"', """'"'"""),
    )
    for value, expected in expectations:
        self.assertEqual(repr(value), expected)

    # Unlike the ascii() expectation, characters from \xa1 upwards are
    # expected verbatim here, with \xad as the one escaped exception.
    latin1repr = (
        "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
        "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
        "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
        "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
        "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
        "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
        "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
        "\xfe\xff'")
    testrepr = repr(''.join(map(chr, range(256))))
    self.assertEqual(testrepr, latin1repr)

    # Test repr works on wide unicode escapes without overflow.
    # Both sides are computed the same way, so this only verifies that
    # the calls complete and agree with each other.
    self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                     repr("\U00010000" * 39 + "\uffff" * 4096))

    class WrongRepr:
        def __repr__(self):
            return b'byte-repr'
    # A __repr__ that returns bytes must make repr() raise TypeError.
    self.assertRaises(TypeError, repr, WrongRepr())
150
def test_iterators(self):
    """Iterating a string yields its characters in order, then stops."""
    # Make sure unicode objects have an __iter__ method
    chars = "\u1111\u2222\u3333".__iter__()
    for expected in ("\u1111", "\u2222", "\u3333"):
        self.assertEqual(next(chars), expected)
    self.assertRaises(StopIteration, next, chars)
Guido van Rossum49d6b072006-08-17 21:11:47 +0000158
def test_count(self):
    """Run the shared count() tests, then check start/end handling.

    Each distinct check is listed once; the source repeated the first
    two calls verbatim.
    """
    string_tests.CommonTest.test_count(self)
    self.checkequalnofix(3, 'aaa', 'count', 'a')
    self.checkequalnofix(0, 'aaa', 'count', 'b')
    # negative start / end offsets
    self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
    self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
    self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
    self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000171
def test_find(self):
    """Check find() results and its argument validation."""
    haystack = 'abcdefghiabc'
    for expected, args in (
        (0, ('abc',)),
        (9, ('abc', 1)),
        (-1, ('def', 4)),
    ):
        self.checkequalnofix(expected, haystack, 'find', *args)

    # A missing or non-string needle is a TypeError.
    self.assertRaises(TypeError, 'hello'.find)
    self.assertRaises(TypeError, 'hello'.find, 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000179
def test_rfind(self):
    """Run the shared rfind() tests, then two direct checks.

    The empty-needle check is listed once; the source repeated it
    verbatim.
    """
    string_tests.CommonTest.test_rfind(self)
    self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc')
    # The empty needle is expected at index 12, the length of the string.
    self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
Guido van Rossum8b264542000-12-19 02:22:31 +0000186
def test_index(self):
    """Run the shared index() tests, then check hits and ValueError misses."""
    string_tests.CommonTest.test_index(self)

    haystack = 'abcdefghiabc'
    for expected, args in (
        (0, ('',)),
        (3, ('def',)),
        (0, ('abc',)),
        (9, ('abc', 1)),
    ):
        self.checkequalnofix(expected, haystack, 'index', *args)

    # Needles that cannot be found from the given start raise ValueError.
    for text, args in (
        ('abcdefghiabc', ('hib',)),
        ('abcdefghiab', ('abc', 1)),
        ('abcdefghi', ('ghi', 8)),
        ('abcdefghi', ('ghi', -1)),
    ):
        self.assertRaises(ValueError, text.index, *args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000197
def test_rindex(self):
    """Run the shared rindex() tests, then check hits and ValueError misses."""
    string_tests.CommonTest.test_rindex(self)

    haystack = 'abcdefghiabc'
    for expected, args in (
        (12, ('',)),
        (3, ('def',)),
        (9, ('abc',)),
        (0, ('abc', 0, -1)),
    ):
        self.checkequalnofix(expected, haystack, 'rindex', *args)

    # Needles that cannot be found inside the given slice raise ValueError.
    for text, args in (
        ('abcdefghiabc', ('hib',)),
        ('defghiabc', ('def', 1)),
        ('defghiabc', ('abc', 0, -1)),
        ('abcdefghi', ('ghi', 0, 8)),
        ('abcdefghi', ('ghi', 0, -1)),
    ):
        self.assertRaises(ValueError, text.rindex, *args)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000210
def test_maketrans_translate(self):
    """Check translate() with raw tables and with maketrans() tables."""
    text = 'abababc'

    # these work with plain translate()
    for expected, table in (
        ('bbbc', {ord('a'): None}),
        ('iiic', {ord('a'): None, ord('b'): ord('i')}),
        ('iiix', {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}),
        ('c', {ord('a'): None, ord('b'): ''}),
    ):
        self.checkequalnofix(expected, text, 'translate', table)
    self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'): 'yy'})

    # this needs maketrans()
    # (a table keyed by the character 'b' leaves the text unchanged)
    self.checkequalnofix('abababc', text, 'translate', {'b': '<i>'})
    maketrans = self.type2test.maketrans
    tbl = maketrans({'a': None, 'b': '<i>'})
    self.checkequalnofix('<i><i><i>c', text, 'translate', tbl)
    # test alternative way of calling maketrans()
    tbl = maketrans('abc', 'xyz', 'd')
    self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

    # Invalid maketrans() arguments.
    self.assertRaises(TypeError, maketrans)
    for exc, args in (
        (ValueError, ('abc', 'defg')),
        (TypeError, (2, 'def')),
        (TypeError, ('abc', 2)),
        (TypeError, ('abc', 'def', 2)),
        (ValueError, ({'xy': 2},)),
        (TypeError, ({(1,): 2},)),
    ):
        self.assertRaises(exc, maketrans, *args)

    # Invalid translate() arguments.
    self.assertRaises(TypeError, 'hello'.translate)
    self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000242
def test_split(self):
    """Run the shared split() tests, then two multi-character separators.

    The '//' check is listed once; the source repeated it verbatim.
    """
    string_tests.CommonTest.test_split(self)

    self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
    # A separator at the very end leaves a trailing empty string.
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
Guido van Rossuma831cac2000-03-10 23:23:21 +0000250
def test_join(self):
    """Run the shared join() tests, then check sequences and bad items.

    Each distinct join() check is listed once; the source repeated the
    list, tuple and Sequence checks verbatim.
    """
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    class MyWrapper:
        def __init__(self, sval): self.sval = sval
        def __str__(self): return self.sval

    # list, tuple and Sequence inputs
    self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
    self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
    self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))

    # Items that are not str must raise TypeError; MyWrapper defines
    # __str__ but is still rejected.
    self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
    self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
    self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
Marc-André Lemburge5034372000-08-08 08:04:29 +0000270
def test_replace(self):
    """Run the shared replace() tests, then a counted replace and a bad argument."""
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    original = 'one!two!three!'
    self.checkequalnofix('one@two!three!', original, 'replace', '!', '@', 1)
    # A non-string replacement is a TypeError.
    self.assertRaises(TypeError, 'replace'.replace, "r", 42)
Guido van Rossuma831cac2000-03-10 23:23:21 +0000277
@support.cpython_only
def test_replace_id(self):
    """Replacing a substring with the same object returns the original string."""
    a = 'a'  # single ascii letters are singletons
    text = 'abc'
    # Pass the named singleton for both arguments; the source assigned
    # `a` but then spelled the literal out twice, leaving it unused.
    self.assertIs(text.replace(a, a), text)
283
def test_bytes_comparison(self):
    """A str compares unequal to bytes and bytearray with the same content."""
    with support.check_warnings():
        # Ignore BytesWarning for the str/bytes comparisons below.
        warnings.simplefilter('ignore', BytesWarning)
        for other in (b'abc', bytearray(b'abc')):
            self.assertEqual('abc' == other, False)
            self.assertEqual('abc' != other, True)
Brett Cannon40430012007-10-22 20:24:51 +0000291
def test_comparison(self):
    """Check equality and prefix ordering of strings.

    The UTF-16 code point order checks are permanently disabled by the
    ``if 0:`` guard and never run.
    """
    # Comparisons:
    self.assertEqual('abc', 'abc')
    self.assertTrue('abcd' > 'abc')
    self.assertTrue('abc' < 'abcd')

    # NOTE(review): indentation was lost in the reviewed source; everything
    # from here to the end of the method is assumed to sit under the
    # disabled branch -- confirm.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue('\u0061' < '\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue('\u0061' < '\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assertTrue(s < s2)

        def test_fixup(s):
            # Same sixteen surrogate pairs, in the same order: the low
            # surrogate varies slowest, the high surrogate fastest.
            for low in ('\udc01', '\udd01', '\ude01', '\udfff'):
                for high in ('\ud800', '\ud900', '\uda00', '\udb00'):
                    test_lecmp(s, high + low)

        test_fixup('\ue000')
        test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
Walter Dörwald28256f22003-01-19 16:59:20 +0000350
def test_islower(self):
    """Run the shared islower() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # non-BMP, uppercase
    for ch in ('\U00010401', '\U00010427'):
        self.assertFalse(ch.islower())
    # non-BMP, lowercase
    for ch in ('\U00010429', '\U0001044E'):
        self.assertTrue(ch.islower())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.islower())
Walter Dörwald28256f22003-01-19 16:59:20 +0000363
def test_isupper(self):
    """Run the shared isupper() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # NOTE(review): indentation was lost in the reviewed source; the
    # platform guard is assumed to cover only this one check -- confirm.
    if not sys.platform.startswith('java'):
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # non-BMP, uppercase
    for ch in ('\U00010401', '\U00010427'):
        self.assertTrue(ch.isupper())
    # non-BMP, lowercase
    for ch in ('\U00010429', '\U0001044E'):
        self.assertFalse(ch.isupper())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.isupper())
Walter Dörwald28256f22003-01-19 16:59:20 +0000377
def test_istitle(self):
    """Run the shared istitle() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for text in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    not_title = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for ch in not_title:
        self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
389
def test_isspace(self):
    """Run the shared isspace() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    for expected, ch in (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    ):
        self.checkequalnofix(expected, ch, 'isspace')
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_bmp = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
               '\U0001F40D', '\U0001F46F')
    for ch in non_bmp:
        self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
399
def test_isalnum(self):
    """Run the shared isalnum() tests, then eight non-BMP characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    alnum_chars = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for ch in alnum_chars:
        self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
Walter Dörwald28256f22003-01-19 16:59:20 +0000405
def test_isalpha(self):
    """Run the shared isalpha() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # non-BMP, cased
    for ch in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(ch.isalpha())
Walter Dörwald28256f22003-01-19 16:59:20 +0000417
def test_isdecimal(self):
    """Check isdecimal() on BMP samples, a bad call, and non-BMP characters."""
    for expected, text in (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    ):
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    not_decimal = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107')
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
    for ch in ('\U0001D7F6', '\U00011066', '\U000104A0'):
        self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
435
def test_isdigit(self):
    """Run the shared isdigit() tests, then BMP and non-BMP spot checks."""
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    for expected, ch in (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    ):
        self.checkequalnofix(expected, ch, 'isdigit')

    not_digits = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                  '\U0001F40D', '\U0001F46F', '\U00011065')
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
    for ch in ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'):
        self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
447
def test_isnumeric(self):
    """Check isnumeric() on BMP samples, a bad call, and non-BMP characters."""
    for expected, text in (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    ):
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    not_numeric = ('\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F')
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
    numeric = ('\U00011065', '\U0001D7F6', '\U00011066',
               '\U000104A0', '\U0001F107')
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
466
def test_isidentifier(self):
    """Check isidentifier() on valid and invalid identifier strings."""
    valid = ("a", "Z", "_", "b0", "bc", "b_", "µ", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
    for name in valid:
        self.assertTrue(name.isidentifier())

    invalid = (" ", "[", "©", "0")
    for name in invalid:
        self.assertFalse(name.isidentifier())
Martin v. Löwis47383402007-08-15 07:32:56 +0000481
def test_isprintable(self):
    """Check isprintable() on ASCII, BMP and non-BMP samples."""
    for text in ("", " ", "abcdefg"):
        self.assertTrue(text.isprintable())
    self.assertFalse("abcdefg\n".isprintable())

    # some defined Unicode character
    self.assertTrue("\u0374".isprintable())
    # undefined character
    self.assertFalse("\u0378".isprintable())
    # single surrogate character
    self.assertFalse("\ud800".isprintable())

    # non-BMP: one printable, one not
    self.assertTrue('\U0001F46F'.isprintable())
    self.assertFalse('\U000E0020'.isprintable())
496
def test_surrogates(self):
    """Check the is*() predicates on strings containing lone surrogates."""
    lower_mixed = ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
                   'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a')
    upper_mixed = ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
                   'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A')
    lone = ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF')

    # Case predicates follow the letters around the surrogates.
    for s in lower_mixed:
        self.assertTrue(s.islower())
        self.assertFalse(s.isupper())
        self.assertFalse(s.istitle())
    for s in upper_mixed:
        self.assertFalse(s.islower())
        self.assertTrue(s.isupper())
        self.assertTrue(s.istitle())

    # Strings made only of surrogates are false for every case predicate.
    for meth_name in ('islower', 'isupper', 'istitle'):
        meth = getattr(str, meth_name)
        for s in lone:
            self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))

    # The remaining predicates are false for any string holding a surrogate.
    other_predicates = ('isalpha', 'isalnum', 'isdigit', 'isspace',
                        'isdecimal', 'isnumeric',
                        'isidentifier', 'isprintable')
    for meth_name in other_predicates:
        meth = getattr(str, meth_name)
        for s in lone + lower_mixed:
            self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
522
523
def test_lower(self):
    """Run the shared lower() tests, then non-BMP cased characters."""
    string_tests.CommonTest.test_lower(self)
    for text, expected in (
        ('\U00010427', '\U0001044F'),
        ('\U00010427\U00010427', '\U0001044F\U0001044F'),
        ('\U00010427\U0001044F', '\U0001044F\U0001044F'),
        ('X\U00010427x\U0001044F', 'x\U0001044Fx\U0001044F'),
    ):
        self.assertEqual(text.lower(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300533
def test_upper(self):
    """Run the shared upper() tests, then non-BMP cased characters."""
    string_tests.CommonTest.test_upper(self)
    for text, expected in (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U00010427'),
        ('\U00010427\U0001044F', '\U00010427\U00010427'),
        ('X\U00010427x\U0001044F', 'X\U00010427X\U00010427'),
    ):
        self.assertEqual(text.upper(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300543
def test_capitalize(self):
    """Run the shared capitalize() tests, then non-BMP cased characters."""
    string_tests.CommonTest.test_capitalize(self)
    for text, expected in (
        ('\U0001044F', '\U00010427'),
        ('\U0001044F\U0001044F', '\U00010427\U0001044F'),
        ('\U00010427\U0001044F', '\U00010427\U0001044F'),
        ('\U0001044F\U00010427', '\U00010427\U0001044F'),
        ('X\U00010427x\U0001044F', 'X\U0001044Fx\U0001044F'),
    ):
        self.assertEqual(text.capitalize(), expected)
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300555
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300556 def test_title(self):
557 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
558 self.assertEqual('\U0001044F'.title(), '\U00010427')
559 self.assertEqual('\U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300560 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300561 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300562 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300563 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300564 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300565 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300566 '\U00010427\U0001044F \U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300567 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300568 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300569
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300570 def test_swapcase(self):
571 string_tests.CommonTest.test_swapcase(self)
572 self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
573 self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
574 self.assertEqual('\U0001044F\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300575 '\U00010427\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300576 self.assertEqual('\U00010427\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300577 '\U0001044F\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300578 self.assertEqual('\U0001044F\U00010427'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300579 '\U00010427\U0001044F')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300580 self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
Ezio Melottia5c92b42011-08-23 00:37:08 +0300581 'x\U0001044FX\U00010427')
Ezio Melotti93e7afc2011-08-22 14:08:38 +0300582
Walter Dörwald28256f22003-01-19 16:59:20 +0000583 def test_contains(self):
584 # Testing Unicode contains method
Benjamin Peterson577473f2010-01-19 00:09:57 +0000585 self.assertIn('a', 'abdb')
586 self.assertIn('a', 'bdab')
587 self.assertIn('a', 'bdaba')
588 self.assertIn('a', 'bdba')
589 self.assertNotIn('a', 'bdb')
590 self.assertIn('a', 'bdba')
591 self.assertIn('a', ('a',1,None))
592 self.assertIn('a', (1,None,'a'))
593 self.assertIn('a', ('a',1,None))
594 self.assertIn('a', (1,None,'a'))
595 self.assertNotIn('a', ('x',1,'y'))
596 self.assertNotIn('a', ('x',1,None))
597 self.assertNotIn('abcd', 'abcxxxx')
598 self.assertIn('ab', 'abcd')
599 self.assertIn('ab', 'abc')
600 self.assertIn('ab', (1,None,'ab'))
601 self.assertIn('', 'abc')
602 self.assertIn('', '')
603 self.assertIn('', 'abc')
604 self.assertNotIn('\0', 'abc')
605 self.assertIn('\0', '\0abc')
606 self.assertIn('\0', 'abc\0')
607 self.assertIn('a', '\0abc')
608 self.assertIn('asdf', 'asdf')
609 self.assertNotIn('asdf', 'asd')
610 self.assertNotIn('asdf', '')
Walter Dörwald28256f22003-01-19 16:59:20 +0000611
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000612 self.assertRaises(TypeError, "abc".__contains__)
Walter Dörwald28256f22003-01-19 16:59:20 +0000613
Eric Smith8c663262007-08-25 02:26:07 +0000614 def test_format(self):
615 self.assertEqual(''.format(), '')
616 self.assertEqual('a'.format(), 'a')
617 self.assertEqual('ab'.format(), 'ab')
618 self.assertEqual('a{{'.format(), 'a{')
619 self.assertEqual('a}}'.format(), 'a}')
620 self.assertEqual('{{b'.format(), '{b')
621 self.assertEqual('}}b'.format(), '}b')
622 self.assertEqual('a{{b'.format(), 'a{b')
623
624 # examples from the PEP:
625 import datetime
626 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
627 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
628 "My name is Fred")
629 self.assertEqual("My name is {0} :-{{}}".format('Fred'),
630 "My name is Fred :-{}")
631
632 d = datetime.date(2007, 8, 18)
633 self.assertEqual("The year is {0.year}".format(d),
634 "The year is 2007")
635
Eric Smith8c663262007-08-25 02:26:07 +0000636 # classes we'll use for testing
637 class C:
638 def __init__(self, x=100):
639 self._x = x
640 def __format__(self, spec):
641 return spec
642
643 class D:
644 def __init__(self, x):
645 self.x = x
646 def __format__(self, spec):
647 return str(self.x)
648
649 # class with __str__, but no __format__
650 class E:
651 def __init__(self, x):
652 self.x = x
653 def __str__(self):
654 return 'E(' + self.x + ')'
655
656 # class with __repr__, but no __format__ or __str__
657 class F:
658 def __init__(self, x):
659 self.x = x
660 def __repr__(self):
661 return 'F(' + self.x + ')'
662
663 # class with __format__ that forwards to string, for some format_spec's
664 class G:
665 def __init__(self, x):
666 self.x = x
667 def __str__(self):
668 return "string is " + self.x
669 def __format__(self, format_spec):
670 if format_spec == 'd':
671 return 'G(' + self.x + ')'
672 return object.__format__(self, format_spec)
673
Eric Smith739e2ad2007-08-27 19:07:22 +0000674 class I(datetime.date):
675 def __format__(self, format_spec):
676 return self.strftime(format_spec)
677
Eric Smith185e30c2007-08-30 22:23:08 +0000678 class J(int):
679 def __format__(self, format_spec):
680 return int.__format__(self * 2, format_spec)
681
Eric Smith8c663262007-08-25 02:26:07 +0000682
683 self.assertEqual(''.format(), '')
684 self.assertEqual('abc'.format(), 'abc')
685 self.assertEqual('{0}'.format('abc'), 'abc')
686 self.assertEqual('{0:}'.format('abc'), 'abc')
687# self.assertEqual('{ 0 }'.format('abc'), 'abc')
688 self.assertEqual('X{0}'.format('abc'), 'Xabc')
689 self.assertEqual('{0}X'.format('abc'), 'abcX')
690 self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
691 self.assertEqual('{1}'.format(1, 'abc'), 'abc')
692 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
693 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
694 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
695 self.assertEqual('{0}'.format(-15), '-15')
696 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
697 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
698 self.assertEqual('{{'.format(), '{')
699 self.assertEqual('}}'.format(), '}')
700 self.assertEqual('{{}}'.format(), '{}')
701 self.assertEqual('{{x}}'.format(), '{x}')
702 self.assertEqual('{{{0}}}'.format(123), '{123}')
703 self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
704 self.assertEqual('}}{{'.format(), '}{')
705 self.assertEqual('}}x{{'.format(), '}x{')
706
Eric Smith7ade6482007-08-26 22:27:13 +0000707 # weird field names
708 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
709 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
Eric Smith4cb4e4e2007-09-03 08:40:29 +0000710 self.assertEqual("{0[ ]}".format({' ':3}), '3')
Eric Smith7ade6482007-08-26 22:27:13 +0000711
Eric Smith8c663262007-08-25 02:26:07 +0000712 self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
713 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
714 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
715 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
716 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
717 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
718 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
719
Eric Smith8c663262007-08-25 02:26:07 +0000720 # strings
721 self.assertEqual('{0:.3s}'.format('abc'), 'abc')
722 self.assertEqual('{0:.3s}'.format('ab'), 'ab')
723 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
724 self.assertEqual('{0:.0s}'.format('abcdef'), '')
725 self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
726 self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
727 self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
728 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
729 self.assertEqual('{0:x<0s}'.format('result'), 'result')
730 self.assertEqual('{0:x<5s}'.format('result'), 'result')
731 self.assertEqual('{0:x<6s}'.format('result'), 'result')
732 self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
733 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
734 self.assertEqual('{0: <7s}'.format('result'), 'result ')
735 self.assertEqual('{0:<7s}'.format('result'), 'result ')
736 self.assertEqual('{0:>7s}'.format('result'), ' result')
737 self.assertEqual('{0:>8s}'.format('result'), ' result')
738 self.assertEqual('{0:^8s}'.format('result'), ' result ')
739 self.assertEqual('{0:^9s}'.format('result'), ' result ')
740 self.assertEqual('{0:^10s}'.format('result'), ' result ')
741 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
742 self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
743 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
744
745 # format specifiers for user defined type
746 self.assertEqual('{0:abc}'.format(C()), 'abc')
747
Georg Brandld52429f2008-07-04 15:55:02 +0000748 # !r, !s and !a coercions
Eric Smith8c663262007-08-25 02:26:07 +0000749 self.assertEqual('{0!s}'.format('Hello'), 'Hello')
750 self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
751 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
752 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
753 self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
754 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
755 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000756 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +0000757 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
758 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl559e5d72008-06-11 18:37:52 +0000759 self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
Amaury Forgeot d'Arca083f1e2008-09-10 23:51:42 +0000760 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
Georg Brandld52429f2008-07-04 15:55:02 +0000761 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl559e5d72008-06-11 18:37:52 +0000762 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
763 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
Georg Brandld52429f2008-07-04 15:55:02 +0000764 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
Eric Smith8c663262007-08-25 02:26:07 +0000765
Eric Smith8c663262007-08-25 02:26:07 +0000766 # test fallback to object.__format__
767 self.assertEqual('{0}'.format({}), '{}')
768 self.assertEqual('{0}'.format([]), '[]')
769 self.assertEqual('{0}'.format([1]), '[1]')
Eric Smithe4d63172010-09-13 20:48:43 +0000770
Eric Smith8c663262007-08-25 02:26:07 +0000771 self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
Eric Smith8c663262007-08-25 02:26:07 +0000772 self.assertEqual('{0!s}'.format(G('data')), 'string is data')
773
Eric Smithe4d63172010-09-13 20:48:43 +0000774 msg = 'object.__format__ with a non-empty format string is deprecated'
Eric V. Smithb9cd3532011-03-12 10:08:48 -0500775 with support.check_warnings((msg, DeprecationWarning)):
Eric Smithe4d63172010-09-13 20:48:43 +0000776 self.assertEqual('{0:^10}'.format(E('data')), ' E(data) ')
777 self.assertEqual('{0:^10s}'.format(E('data')), ' E(data) ')
778 self.assertEqual('{0:>15s}'.format(G('data')), ' string is data')
779
Eric Smith739e2ad2007-08-27 19:07:22 +0000780 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
781 month=8,
782 day=27)),
783 "date: 2007-08-27")
784
Eric Smith185e30c2007-08-30 22:23:08 +0000785 # test deriving from a builtin type and overriding __format__
786 self.assertEqual("{0}".format(J(10)), "20")
787
788
Eric Smith8c663262007-08-25 02:26:07 +0000789 # string format specifiers
790 self.assertEqual('{0:}'.format('a'), 'a')
791
792 # computed format specifiers
793 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
794 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
795 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
796 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
797 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
798
799 # test various errors
800 self.assertRaises(ValueError, '{'.format)
801 self.assertRaises(ValueError, '}'.format)
802 self.assertRaises(ValueError, 'a{'.format)
803 self.assertRaises(ValueError, 'a}'.format)
804 self.assertRaises(ValueError, '{a'.format)
805 self.assertRaises(ValueError, '}a'.format)
Eric Smith11529192007-09-04 23:04:22 +0000806 self.assertRaises(IndexError, '{0}'.format)
807 self.assertRaises(IndexError, '{1}'.format, 'abc')
808 self.assertRaises(KeyError, '{x}'.format)
Eric Smith8c663262007-08-25 02:26:07 +0000809 self.assertRaises(ValueError, "}{".format)
Eric Smith8c663262007-08-25 02:26:07 +0000810 self.assertRaises(ValueError, "abc{0:{}".format)
811 self.assertRaises(ValueError, "{0".format)
Eric Smith11529192007-09-04 23:04:22 +0000812 self.assertRaises(IndexError, "{0.}".format)
813 self.assertRaises(ValueError, "{0.}".format, 0)
814 self.assertRaises(IndexError, "{0[}".format)
Eric Smith4cb4e4e2007-09-03 08:40:29 +0000815 self.assertRaises(ValueError, "{0[}".format, [])
Eric Smith11529192007-09-04 23:04:22 +0000816 self.assertRaises(KeyError, "{0]}".format)
817 self.assertRaises(ValueError, "{0.[]}".format, 0)
Eric Smith7ade6482007-08-26 22:27:13 +0000818 self.assertRaises(ValueError, "{0..foo}".format, 0)
Eric Smith11529192007-09-04 23:04:22 +0000819 self.assertRaises(ValueError, "{0[0}".format, 0)
820 self.assertRaises(ValueError, "{0[0:foo}".format, 0)
821 self.assertRaises(KeyError, "{c]}".format)
822 self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
823 self.assertRaises(ValueError, "{0}}".format, 0)
824 self.assertRaises(KeyError, "{foo}".format, bar=3)
Eric Smith8c663262007-08-25 02:26:07 +0000825 self.assertRaises(ValueError, "{0!x}".format, 3)
Eric Smith11529192007-09-04 23:04:22 +0000826 self.assertRaises(ValueError, "{0!}".format, 0)
827 self.assertRaises(ValueError, "{0!rs}".format, 0)
Eric Smith8c663262007-08-25 02:26:07 +0000828 self.assertRaises(ValueError, "{!}".format)
Eric Smith8ec90442009-03-14 12:29:34 +0000829 self.assertRaises(IndexError, "{:}".format)
830 self.assertRaises(IndexError, "{:s}".format)
831 self.assertRaises(IndexError, "{}".format)
Benjamin Peterson59a1b2f2010-06-07 22:31:26 +0000832 big = "23098475029384702983476098230754973209482573"
833 self.assertRaises(ValueError, ("{" + big + "}").format)
834 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
Eric Smith8c663262007-08-25 02:26:07 +0000835
Eric Smith41669ca2009-05-23 14:23:22 +0000836 # issue 6089
837 self.assertRaises(ValueError, "{0[0]x}".format, [None])
838 self.assertRaises(ValueError, "{0[0](10)}".format, [None])
839
Eric Smith8c663262007-08-25 02:26:07 +0000840 # can't have a replacement on the field name portion
841 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
842
843 # exceed maximum recursion depth
844 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
845 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
846 0, 1, 2, 3, 4, 5, 6, 7)
847
848 # string format spec errors
849 self.assertRaises(ValueError, "{0:-s}".format, '')
850 self.assertRaises(ValueError, format, "", "-")
851 self.assertRaises(ValueError, "{0:=s}".format, '')
852
Eric Smithb1ebcc62008-07-15 13:02:41 +0000853 # Alternate formatting is not supported
854 self.assertRaises(ValueError, format, '', '#')
855 self.assertRaises(ValueError, format, '', '#20')
856
Eric Smith27bbca62010-11-04 17:06:58 +0000857 def test_format_map(self):
858 self.assertEqual(''.format_map({}), '')
859 self.assertEqual('a'.format_map({}), 'a')
860 self.assertEqual('ab'.format_map({}), 'ab')
861 self.assertEqual('a{{'.format_map({}), 'a{')
862 self.assertEqual('a}}'.format_map({}), 'a}')
863 self.assertEqual('{{b'.format_map({}), '{b')
864 self.assertEqual('}}b'.format_map({}), '}b')
865 self.assertEqual('a{{b'.format_map({}), 'a{b')
866
867 # using mappings
868 class Mapping(dict):
869 def __missing__(self, key):
870 return key
871 self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
872 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
873
874 class InternalMapping:
875 def __init__(self):
876 self.mapping = {'a': 'hello'}
877 def __getitem__(self, key):
878 return self.mapping[key]
879 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
880
881
Eric Smith27bbca62010-11-04 17:06:58 +0000882 class C:
883 def __init__(self, x=100):
884 self._x = x
885 def __format__(self, spec):
886 return spec
Eric Smith27bbca62010-11-04 17:06:58 +0000887 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
888
889 # test various errors
890 self.assertRaises(TypeError, '{'.format_map)
891 self.assertRaises(TypeError, '}'.format_map)
892 self.assertRaises(TypeError, 'a{'.format_map)
893 self.assertRaises(TypeError, 'a}'.format_map)
894 self.assertRaises(TypeError, '{a'.format_map)
895 self.assertRaises(TypeError, '}a'.format_map)
896
Eric V. Smith12ebefc2011-07-18 14:03:41 -0400897 # issue #12579: can't supply positional params to format_map
898 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
899 self.assertRaises(ValueError, '{}'.format_map, 'a')
900 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
901
Eric Smith8ec90442009-03-14 12:29:34 +0000902 def test_format_auto_numbering(self):
903 class C:
904 def __init__(self, x=100):
905 self._x = x
906 def __format__(self, spec):
907 return spec
908
909 self.assertEqual('{}'.format(10), '10')
910 self.assertEqual('{:5}'.format('s'), 's ')
911 self.assertEqual('{!r}'.format('s'), "'s'")
912 self.assertEqual('{._x}'.format(C(10)), '10')
913 self.assertEqual('{[1]}'.format([1, 2]), '2')
914 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
915 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
916
917 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
918 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
919
920 # can't mix and match numbering and auto-numbering
921 self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
922 self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
923 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
924 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
925
926 # can mix and match auto-numbering and named
927 self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
928 self.assertEqual('{}{f}'.format(4, f='test'), '4test')
929 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
930 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
931
Walter Dörwald28256f22003-01-19 16:59:20 +0000932 def test_formatting(self):
Walter Dörwald0fd583c2003-02-21 12:53:50 +0000933 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
Walter Dörwald28256f22003-01-19 16:59:20 +0000934 # Testing Unicode formatting strings...
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000935 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
936 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
937 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
938 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
939 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
940 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
Walter Dörwald28256f22003-01-19 16:59:20 +0000941 if not sys.platform.startswith('java'):
Walter Dörwald67e83882007-05-05 12:26:27 +0000942 self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
Georg Brandl559e5d72008-06-11 18:37:52 +0000943 self.assertEqual("%r" % ("\u1234",), "'\u1234'")
944 self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000945 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
946 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
Walter Dörwald56fbcb52003-03-31 18:18:41 +0000947
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000948 self.assertEqual('%c' % 0x1234, '\u1234')
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +0000949 self.assertEqual('%c' % 0x21483, '\U00021483')
950 self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
951 self.assertEqual('%c' % '\U00021483', '\U00021483')
952 self.assertRaises(TypeError, "%c".__mod__, "aa")
Stefan Krah99212f62010-07-19 17:58:26 +0000953 self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -0700954 self.assertRaises(TypeError, "%i".__mod__, "aa")
Walter Dörwald28256f22003-01-19 16:59:20 +0000955
956 # formatting jobs delegated from the string implementation:
Walter Dörwald28256f22003-01-19 16:59:20 +0000957 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000958 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
959 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
960 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
961 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
962 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
963 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
964 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
965 self.assertEqual('...%s...' % "abc", '...abc...')
966 self.assertEqual('%*s' % (5,'abc',), ' abc')
967 self.assertEqual('%*s' % (-5,'abc',), 'abc ')
968 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
969 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
970 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
971 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
972 self.assertEqual('%c' % 'a', 'a')
Neil Schemenauercf52c072005-08-12 17:34:58 +0000973 class Wrapper:
974 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000975 return '\u1234'
976 self.assertEqual('%s' % Wrapper(), '\u1234')
Walter Dörwald28256f22003-01-19 16:59:20 +0000977
Eric Smith741191f2009-05-06 13:08:15 +0000978 # issue 3382
979 NAN = float('nan')
980 INF = float('inf')
981 self.assertEqual('%f' % NAN, 'nan')
982 self.assertEqual('%F' % NAN, 'NAN')
983 self.assertEqual('%f' % INF, 'inf')
984 self.assertEqual('%F' % INF, 'INF')
985
Ezio Melottiba42fd52011-04-26 06:09:45 +0300986 def test_startswith_endswith_errors(self):
987 for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melottif2b3f782011-04-26 06:40:59 +0300988 with self.assertRaises(TypeError) as cm:
Ezio Melottiba42fd52011-04-26 06:09:45 +0300989 meth(['f'])
Ezio Melottif2b3f782011-04-26 06:40:59 +0300990 exc = str(cm.exception)
Ezio Melottiba42fd52011-04-26 06:09:45 +0300991 self.assertIn('str', exc)
992 self.assertIn('tuple', exc)
993
Benjamin Petersonee8712c2008-05-20 21:35:26 +0000994 @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
Georg Brandlda6b1072006-01-20 17:48:54 +0000995 def test_format_float(self):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000996 # should not format with a comma, but always with C locale
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000997 self.assertEqual('1.0', '%.1f' % 1.0)
Georg Brandlda6b1072006-01-20 17:48:54 +0000998
Walter Dörwald28256f22003-01-19 16:59:20 +0000999 def test_constructor(self):
1000 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1001
1002 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001003 str('unicode remains unicode'),
1004 'unicode remains unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001005 )
1006
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001007 class UnicodeSubclass(str):
Marc-André Lemburg79f57832002-12-29 19:44:06 +00001008 pass
Guido van Rossuma831cac2000-03-10 23:23:21 +00001009
Victor Stinner07ac3eb2011-10-01 16:16:43 +02001010 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1011 subclass = UnicodeSubclass(text)
1012 self.assertEqual(str(subclass), text)
1013 self.assertEqual(len(subclass), len(text))
1014 if text == 'ascii':
1015 self.assertEqual(subclass.encode('ascii'), b'ascii')
1016 self.assertEqual(subclass.encode('utf-8'), b'ascii')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001017
Walter Dörwald28256f22003-01-19 16:59:20 +00001018 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001019 str('strings are converted to unicode'),
1020 'strings are converted to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001021 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001022
Walter Dörwald28256f22003-01-19 16:59:20 +00001023 class StringCompat:
1024 def __init__(self, x):
1025 self.x = x
1026 def __str__(self):
1027 return self.x
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001028
Walter Dörwald28256f22003-01-19 16:59:20 +00001029 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001030 str(StringCompat('__str__ compatible objects are recognized')),
1031 '__str__ compatible objects are recognized'
Walter Dörwald28256f22003-01-19 16:59:20 +00001032 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001033
Walter Dörwald28256f22003-01-19 16:59:20 +00001034 # unicode(obj) is compatible to str():
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001035
Walter Dörwald28256f22003-01-19 16:59:20 +00001036 o = StringCompat('unicode(obj) is compatible to str()')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001037 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Walter Dörwald28256f22003-01-19 16:59:20 +00001038 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001039
Guido van Rossume2a383d2007-01-15 16:59:06 +00001040 for obj in (123, 123.45, 123):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001041 self.assertEqual(str(obj), str(str(obj)))
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001042
Walter Dörwald28256f22003-01-19 16:59:20 +00001043 # unicode(obj, encoding, error) tests (this maps to
1044 # PyUnicode_FromEncodedObject() at C level)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001045
Walter Dörwald28256f22003-01-19 16:59:20 +00001046 if not sys.platform.startswith('java'):
1047 self.assertRaises(
1048 TypeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001049 str,
1050 'decoding unicode is not supported',
Walter Dörwald28256f22003-01-19 16:59:20 +00001051 'utf-8',
1052 'strict'
1053 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001054
Walter Dörwald28256f22003-01-19 16:59:20 +00001055 self.assertEqual(
Walter Dörwald67e83882007-05-05 12:26:27 +00001056 str(b'strings are decoded to unicode', 'utf-8', 'strict'),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001057 'strings are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001058 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001059
Walter Dörwald28256f22003-01-19 16:59:20 +00001060 if not sys.platform.startswith('java'):
1061 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001062 str(
Guido van Rossumbae07c92007-10-08 02:46:15 +00001063 memoryview(b'character buffers are decoded to unicode'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001064 'utf-8',
1065 'strict'
1066 ),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001067 'character buffers are decoded to unicode'
Walter Dörwald28256f22003-01-19 16:59:20 +00001068 )
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001069
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001070 self.assertRaises(TypeError, str, 42, 42, 42)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001071
Walter Dörwald28256f22003-01-19 16:59:20 +00001072 def test_codecs_utf7(self):
1073 utfTests = [
Walter Dörwald67e83882007-05-05 12:26:27 +00001074 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1075 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1076 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1077 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1078 ('+', b'+-'),
1079 ('+-', b'+--'),
1080 ('+?', b'+-?'),
1081 ('\?', b'+AFw?'),
1082 ('+?', b'+-?'),
1083 (r'\\?', b'+AFwAXA?'),
1084 (r'\\\?', b'+AFwAXABc?'),
Antoine Pitrou244651a2009-05-04 18:56:13 +00001085 (r'++--', b'+-+---'),
1086 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1087 ('/', b'/'),
Walter Dörwald28256f22003-01-19 16:59:20 +00001088 ]
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001089
Walter Dörwald28256f22003-01-19 16:59:20 +00001090 for (x, y) in utfTests:
1091 self.assertEqual(x.encode('utf-7'), y)
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001092
Antoine Pitrou244651a2009-05-04 18:56:13 +00001093 # Unpaired surrogates not supported
Walter Dörwald67e83882007-05-05 12:26:27 +00001094 self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001095
Antoine Pitrou244651a2009-05-04 18:56:13 +00001096 self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001097
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001098 # Issue #2242: crash on some Windows/MSVC versions
Antoine Pitrou244651a2009-05-04 18:56:13 +00001099 self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1100
1101 # Direct encoded characters
1102 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1103 # Optional direct characters
1104 set_o = '!"#$%&*;<=>@[]^_`{|}'
1105 for c in set_d:
1106 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1107 self.assertEqual(c.encode('ascii').decode('utf7'), c)
1108 for c in set_o:
1109 self.assertEqual(c.encode('ascii').decode('utf7'), c)
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001110
Walter Dörwald28256f22003-01-19 16:59:20 +00001111 def test_codecs_utf8(self):
Walter Dörwald67e83882007-05-05 12:26:27 +00001112 self.assertEqual(''.encode('utf-8'), b'')
1113 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001114 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1115 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001116 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1117 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
Ezio Melottia9860ae2011-10-04 19:06:00 +03001118 self.assertEqual(('\U00010002'*10).encode('utf-8'),
1119 b'\xf0\x90\x80\x82'*10)
Walter Dörwald28256f22003-01-19 16:59:20 +00001120 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001121 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1122 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1123 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1124 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1125 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1126 ' Nunstuck git und'.encode('utf-8'),
Walter Dörwald67e83882007-05-05 12:26:27 +00001127 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1128 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1129 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1130 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1131 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1132 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1133 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1134 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1135 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1136 b'\xe3\x80\x8cWenn ist das Nunstuck git und'
Walter Dörwald28256f22003-01-19 16:59:20 +00001137 )
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001138
Walter Dörwald28256f22003-01-19 16:59:20 +00001139 # UTF-8 specific decoding tests
Walter Dörwald67e83882007-05-05 12:26:27 +00001140 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1141 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1142 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001143
Walter Dörwald28256f22003-01-19 16:59:20 +00001144 # Other possible utf-8 test cases:
1145 # * strict decoding testing for all of the
1146 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001147
Ezio Melotti57221d02010-07-01 07:32:02 +00001148 def test_utf8_decode_valid_sequences(self):
1149 sequences = [
1150 # single byte
1151 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1152 # 2 bytes
1153 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1154 # 3 bytes
1155 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1156 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1157 # 4 bytes
1158 (b'\xF0\x90\x80\x80', '\U00010000'),
1159 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1160 ]
1161 for seq, res in sequences:
1162 self.assertEqual(seq.decode('utf-8'), res)
1163
1164
1165 def test_utf8_decode_invalid_sequences(self):
1166 # continuation bytes in a sequence of 2, 3, or 4 bytes
1167 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1168 # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
1169 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1170 # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
1171 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1172 invalid_start_bytes = (
1173 continuation_bytes + invalid_2B_seq_start_bytes +
1174 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1175 )
1176
1177 for byte in invalid_start_bytes:
1178 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1179
1180 for sb in invalid_2B_seq_start_bytes:
1181 for cb in continuation_bytes:
1182 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1183
1184 for sb in invalid_4B_seq_start_bytes:
1185 for cb1 in continuation_bytes[:3]:
1186 for cb3 in continuation_bytes[:3]:
1187 self.assertRaises(UnicodeDecodeError,
1188 (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1189
1190 for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1191 self.assertRaises(UnicodeDecodeError,
1192 (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1193 self.assertRaises(UnicodeDecodeError,
1194 (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1195 # surrogates
1196 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1197 self.assertRaises(UnicodeDecodeError,
1198 (b'\xED'+cb+b'\x80').decode, 'utf-8')
1199 self.assertRaises(UnicodeDecodeError,
1200 (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1201 for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1202 self.assertRaises(UnicodeDecodeError,
1203 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1204 self.assertRaises(UnicodeDecodeError,
1205 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1206 for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1207 self.assertRaises(UnicodeDecodeError,
1208 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1209 self.assertRaises(UnicodeDecodeError,
1210 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1211
1212 def test_issue8271(self):
1213 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1214 # only the start byte and the continuation byte(s) are now considered
1215 # invalid, instead of the number of bytes specified by the start byte.
1216 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1217 # table 3-8, Row 2) for more information about the algorithm used.
1218 FFFD = '\ufffd'
1219 sequences = [
1220 # invalid start bytes
1221 (b'\x80', FFFD), # continuation byte
1222 (b'\x80\x80', FFFD*2), # 2 continuation bytes
1223 (b'\xc0', FFFD),
1224 (b'\xc0\xc0', FFFD*2),
1225 (b'\xc1', FFFD),
1226 (b'\xc1\xc0', FFFD*2),
1227 (b'\xc0\xc1', FFFD*2),
1228 # with start byte of a 2-byte sequence
1229 (b'\xc2', FFFD), # only the start byte
1230 (b'\xc2\xc2', FFFD*2), # 2 start bytes
1231 (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes
1232 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1233 # with start byte of a 3-byte sequence
1234 (b'\xe1', FFFD), # only the start byte
1235 (b'\xe1\xe1', FFFD*2), # 2 start bytes
1236 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1237 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1238 (b'\xe1\x80', FFFD), # only 1 continuation byte
1239 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1240 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1241 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1242 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1243 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1244 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1245 # with start byte of a 4-byte sequence
1246 (b'\xf1', FFFD), # only the start byte
1247 (b'\xf1\xf1', FFFD*2), # 2 start bytes
1248 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1249 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1250 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1251 (b'\xf1\x80', FFFD), # only 1 continuation bytes
1252 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1253 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1254 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1255 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1256 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1257 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1258 (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1259 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1260 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1261 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1262 (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1263 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1264 # with invalid start byte of a 4-byte sequence (rfc2279)
1265 (b'\xf5', FFFD), # only the start byte
1266 (b'\xf5\xf5', FFFD*2), # 2 start bytes
1267 (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1268 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1269 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1270 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1271 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1272 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1273 # with invalid start byte of a 5-byte sequence (rfc2279)
1274 (b'\xf8', FFFD), # only the start byte
1275 (b'\xf8\xf8', FFFD*2), # 2 start bytes
1276 (b'\xf8\x80', FFFD*2), # only one continuation byte
1277 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1278 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1279 # with invalid start byte of a 6-byte sequence (rfc2279)
1280 (b'\xfc', FFFD), # only the start byte
1281 (b'\xfc\xfc', FFFD*2), # 2 start bytes
1282 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1283 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1284 # invalid start byte
1285 (b'\xfe', FFFD),
1286 (b'\xfe\x80\x80', FFFD*3),
1287 # other sequences
1288 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1289 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1290 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1291 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1292 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1293 ]
1294 for n, (seq, res) in enumerate(sequences):
1295 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1296 self.assertEqual(seq.decode('utf-8', 'replace'), res)
1297 self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1298 self.assertEqual(seq.decode('utf-8', 'ignore'),
1299 res.replace('\uFFFD', ''))
1300
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001301 def test_codecs_idna(self):
1302 # Test whether trailing dot is preserved
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001303 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
Martin v. Löwis0d8e16c2003-08-05 06:19:47 +00001304
def test_codecs_errors(self):
    """Exercise the error paths of str.encode(), bytes.decode() and str().

    Covers the 'strict', 'ignore' and 'replace' handlers for ASCII,
    malformed unicode-escape input, codecs whose functions return the
    wrong kind of object, wrong argument types and lone surrogates passed
    to the numeric constructors.
    """
    # Error handling (encoding)
    self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
    self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
    self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
    self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
    # Positional and keyword spellings of the arguments must agree.
    self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
                     'Andr\202 x'.encode('ascii', errors='replace'))
    self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
                     'Andr\202 x'.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
    self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
    self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
    self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

    # "test.unicode1" and "test.unicode2" are the deliberately broken codecs
    # registered by search_function() at the top of this file: their
    # functions return 42 or (42, 42) instead of a proper result tuple.
    self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
    self.assertRaises(TypeError, str, b"hello", "test.unicode2")
    self.assertRaises(TypeError, "hello".encode, "test.unicode1")
    self.assertRaises(TypeError, "hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        ["non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

    # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
    self.assertRaises(UnicodeError, int, "\ud800")
    self.assertRaises(UnicodeError, int, "\udf00")
    self.assertRaises(UnicodeError, float, "\ud800")
    self.assertRaises(UnicodeError, float, "\udf00")
    self.assertRaises(UnicodeError, complex, "\ud800")
    self.assertRaises(UnicodeError, complex, "\udf00")
Guido van Rossum97064862000-04-10 13:52:48 +00001351
Walter Dörwald28256f22003-01-19 16:59:20 +00001352 def test_codecs(self):
1353 # Encoding
Walter Dörwald67e83882007-05-05 12:26:27 +00001354 self.assertEqual('hello'.encode('ascii'), b'hello')
1355 self.assertEqual('hello'.encode('utf-7'), b'hello')
1356 self.assertEqual('hello'.encode('utf-8'), b'hello')
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001357 self.assertEqual('hello'.encode('utf-8'), b'hello')
Walter Dörwald67e83882007-05-05 12:26:27 +00001358 self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
1359 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
1360 self.assertEqual('hello'.encode('latin-1'), b'hello')
Guido van Rossum97064862000-04-10 13:52:48 +00001361
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001362 # Default encoding is utf-8
1363 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
1364
Walter Dörwald28256f22003-01-19 16:59:20 +00001365 # Roundtrip safety for BMP (just the first 1024 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001366 for c in range(1024):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001367 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001368 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1369 'utf-16-be', 'raw_unicode_escape',
1370 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001371 self.assertEqual(str(u.encode(encoding),encoding), u)
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001372
Walter Dörwald28256f22003-01-19 16:59:20 +00001373 # Roundtrip safety for BMP (just the first 256 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001374 for c in range(256):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001375 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001376 for encoding in ('latin-1',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001377 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001378
Walter Dörwald28256f22003-01-19 16:59:20 +00001379 # Roundtrip safety for BMP (just the first 128 chars)
Guido van Rossum805365e2007-05-07 22:24:25 +00001380 for c in range(128):
Guido van Rossum84fc66d2007-05-03 17:18:26 +00001381 u = chr(c)
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001382 for encoding in ('ascii',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001383 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001384
Walter Dörwald28256f22003-01-19 16:59:20 +00001385 # Roundtrip safety for non-BMP (just a few chars)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001386 u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
Walter Dörwald28256f22003-01-19 16:59:20 +00001387 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1388 #'raw_unicode_escape',
1389 'unicode_escape', 'unicode_internal'):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001390 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossumd8855fd2000-03-24 22:14:19 +00001391
Walter Dörwald28256f22003-01-19 16:59:20 +00001392 # UTF-8 must be roundtrip safe for all UCS-2 code points
1393 # This excludes surrogates: in the full range, there would be
1394 # a surrogate pair (\udbff\udc00), which gets converted back
1395 # to a non-BMP character (\U0010fc00)
Walter Dörwald1324c6f2007-05-11 19:57:05 +00001396 u = ''.join(map(chr, list(range(0,0xd800)) +
1397 list(range(0xe000,0x10000))))
Walter Dörwald28256f22003-01-19 16:59:20 +00001398 for encoding in ('utf-8',):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001399 self.assertEqual(str(u.encode(encoding),encoding), u)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001400
Walter Dörwald28256f22003-01-19 16:59:20 +00001401 def test_codecs_charmap(self):
1402 # 0-127
Guido van Rossum805365e2007-05-07 22:24:25 +00001403 s = bytes(range(128))
Walter Dörwald28256f22003-01-19 16:59:20 +00001404 for encoding in (
1405 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001406 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1407 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001408 'cp863', 'cp865', 'cp866',
1409 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1410 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1411 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1412 'mac_cyrillic', 'mac_latin2',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001413
Walter Dörwald28256f22003-01-19 16:59:20 +00001414 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1415 'cp1256', 'cp1257', 'cp1258',
1416 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001417
Walter Dörwald28256f22003-01-19 16:59:20 +00001418 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1419 'cp1006', 'iso8859_8',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001420
Walter Dörwald28256f22003-01-19 16:59:20 +00001421 ### These have undefined mappings:
1422 #'cp424',
Guido van Rossum9e896b32000-04-05 20:11:21 +00001423
Walter Dörwald28256f22003-01-19 16:59:20 +00001424 ### These fail the round-trip:
1425 #'cp875'
Guido van Rossum9e896b32000-04-05 20:11:21 +00001426
Walter Dörwald28256f22003-01-19 16:59:20 +00001427 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001428 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001429
Walter Dörwald28256f22003-01-19 16:59:20 +00001430 # 128-255
Guido van Rossum805365e2007-05-07 22:24:25 +00001431 s = bytes(range(128, 256))
Walter Dörwald28256f22003-01-19 16:59:20 +00001432 for encoding in (
1433 'cp037', 'cp1026',
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001434 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1435 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
Walter Dörwald28256f22003-01-19 16:59:20 +00001436 'cp863', 'cp865', 'cp866',
1437 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1438 'iso8859_2', 'iso8859_4', 'iso8859_5',
1439 'iso8859_9', 'koi8_r', 'latin_1',
1440 'mac_cyrillic', 'mac_latin2',
Fred Drake004d5e62000-10-23 17:22:08 +00001441
Walter Dörwald28256f22003-01-19 16:59:20 +00001442 ### These have undefined mappings:
1443 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1444 #'cp1256', 'cp1257', 'cp1258',
1445 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1446 #'iso8859_3', 'iso8859_6', 'iso8859_7',
1447 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
Fred Drake004d5e62000-10-23 17:22:08 +00001448
Walter Dörwald28256f22003-01-19 16:59:20 +00001449 ### These fail the round-trip:
1450 #'cp1006', 'cp875', 'iso8859_8',
Tim Peters2f228e72001-05-13 00:19:31 +00001451
Walter Dörwald28256f22003-01-19 16:59:20 +00001452 ):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001453 self.assertEqual(str(s, encoding).encode(encoding), s)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001454
Walter Dörwald28256f22003-01-19 16:59:20 +00001455 def test_concatenation(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001456 self.assertEqual(("abc" "def"), "abcdef")
1457 self.assertEqual(("abc" "def"), "abcdef")
1458 self.assertEqual(("abc" "def"), "abcdef")
1459 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1460 self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
Fred Drake004d5e62000-10-23 17:22:08 +00001461
Walter Dörwald28256f22003-01-19 16:59:20 +00001462 def test_printing(self):
1463 class BitBucket:
1464 def write(self, text):
1465 pass
Fred Drake004d5e62000-10-23 17:22:08 +00001466
Walter Dörwald28256f22003-01-19 16:59:20 +00001467 out = BitBucket()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001468 print('abc', file=out)
1469 print('abc', 'def', file=out)
1470 print('abc', 'def', file=out)
1471 print('abc', 'def', file=out)
1472 print('abc\n', file=out)
1473 print('abc\n', end=' ', file=out)
1474 print('abc\n', end=' ', file=out)
1475 print('def\n', file=out)
1476 print('def\n', file=out)
Fred Drake004d5e62000-10-23 17:22:08 +00001477
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001478 def test_ucs4(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001479 x = '\U00100000'
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00001480 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1481 self.assertEqual(x, y)
1482
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001483 y = br'\U00100000'
1484 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1485 self.assertEqual(x, y)
1486 y = br'\U00010000'
1487 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1488 self.assertEqual(x, y)
Christian Heimesfe337bf2008-03-23 21:54:12 +00001489
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001490 try:
1491 br'\U11111111'.decode("raw-unicode-escape")
1492 except UnicodeDecodeError as e:
1493 self.assertEqual(e.start, 0)
1494 self.assertEqual(e.end, 10)
1495 else:
1496 self.fail("Should have raised UnicodeDecodeError")
Christian Heimesfe337bf2008-03-23 21:54:12 +00001497
Brett Cannonc3647ac2005-04-26 03:45:26 +00001498 def test_conversion(self):
1499 # Make sure __unicode__() works properly
1500 class Foo0:
1501 def __str__(self):
1502 return "foo"
1503
1504 class Foo1:
Guido van Rossum98297ee2007-11-06 21:34:58 +00001505 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001506 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001507
1508 class Foo2(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001509 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001510 return "foo"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001511
1512 class Foo3(object):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001513 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001514 return "foo"
1515
1516 class Foo4(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001517 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001518 return "foo"
1519
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001520 class Foo5(str):
Guido van Rossum98297ee2007-11-06 21:34:58 +00001521 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001522 return "foo"
1523
1524 class Foo6(str):
1525 def __str__(self):
1526 return "foos"
1527
Guido van Rossum98297ee2007-11-06 21:34:58 +00001528 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001529 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001530
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001531 class Foo7(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001532 def __str__(self):
1533 return "foos"
Guido van Rossum98297ee2007-11-06 21:34:58 +00001534 def __str__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001535 return "foou"
Brett Cannonc3647ac2005-04-26 03:45:26 +00001536
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001537 class Foo8(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001538 def __new__(cls, content=""):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001539 return str.__new__(cls, 2*content)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001540 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001541 return self
1542
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001543 class Foo9(str):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001544 def __str__(self):
Brett Cannonc3647ac2005-04-26 03:45:26 +00001545 return "not unicode"
1546
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001547 self.assertEqual(str(Foo0()), "foo")
1548 self.assertEqual(str(Foo1()), "foo")
1549 self.assertEqual(str(Foo2()), "foo")
1550 self.assertEqual(str(Foo3()), "foo")
1551 self.assertEqual(str(Foo4("bar")), "foo")
1552 self.assertEqual(str(Foo5("bar")), "foo")
1553 self.assertEqual(str(Foo6("bar")), "foou")
1554 self.assertEqual(str(Foo7("bar")), "foou")
1555 self.assertEqual(str(Foo8("foo")), "foofoo")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001556 self.assertEqual(str(Foo9("foo")), "not unicode")
Brett Cannonc3647ac2005-04-26 03:45:26 +00001557
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001558 def test_unicode_repr(self):
1559 class s1:
1560 def __repr__(self):
1561 return '\\n'
1562
1563 class s2:
1564 def __repr__(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001565 return '\\n'
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001566
1567 self.assertEqual(repr(s1()), '\\n')
1568 self.assertEqual(repr(s2()), '\\n')
1569
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001570 def test_printable_repr(self):
1571 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001572 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001573
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001574 def test_expandtabs_overflows_gracefully(self):
1575 # This test only affects 32-bit platforms because expandtabs can only take
1576 # an int as the max value, not a 64-bit C long. If expandtabs is changed
1577 # to take a 64-bit long, this test should apply to all platforms.
Christian Heimesa37d4c62007-12-04 23:02:19 +00001578 if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001579 return
Christian Heimesa37d4c62007-12-04 23:02:19 +00001580 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001581
Antoine Pitroue19aa382011-10-04 16:04:01 +02001582 def test_expandtabs_optimization(self):
1583 s = 'abc'
1584 self.assertIs(s.expandtabs(), s)
1585
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00001586 def test_raiseMemError(self):
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 if struct.calcsize('P') == 8:
1588 # 64 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001589 ascii_struct_size = 48
1590 compact_struct_size = 72
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 else:
1592 # 32 bits pointers
Martin v. Löwis287eca62011-09-28 10:03:28 +02001593 ascii_struct_size = 24
1594 compact_struct_size = 36
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
1596 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
1597 code = ord(char)
1598 if code < 0x100:
1599 char_size = 1 # sizeof(Py_UCS1)
1600 struct_size = ascii_struct_size
1601 elif code < 0x10000:
1602 char_size = 2 # sizeof(Py_UCS2)
1603 struct_size = compact_struct_size
1604 else:
1605 char_size = 4 # sizeof(Py_UCS4)
1606 struct_size = compact_struct_size
1607 # Note: sys.maxsize is half of the actual max allocation because of
Martin v. Löwis287eca62011-09-28 10:03:28 +02001608 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
1609 # be allocatable, given enough memory.
1610 maxlen = ((sys.maxsize - struct_size) // char_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 alloc = lambda: char * maxlen
1612 self.assertRaises(MemoryError, alloc)
1613 self.assertRaises(MemoryError, alloc)
Antoine Pitrou3db3e872008-08-17 17:06:51 +00001614
Victor Stinner808fc0a2010-03-22 12:50:40 +00001615 def test_format_subclass(self):
1616 class S(str):
1617 def __str__(self):
1618 return '__str__ overridden'
1619 s = S('xxx')
Florent Xiclunaa87b3832010-09-13 02:28:18 +00001620 self.assertEqual("%s" % s, '__str__ overridden')
1621 self.assertEqual("{}".format(s), '__str__ overridden')
Victor Stinner808fc0a2010-03-22 12:50:40 +00001622
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001623 # Test PyUnicode_FromFormat()
Victor Stinner1205f272010-09-11 00:54:47 +00001624 def test_from_format(self):
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001625 support.import_module('ctypes')
Victor Stinner6d970f42011-03-02 00:04:25 +00001626 from ctypes import (pythonapi, py_object,
1627 c_int, c_long, c_longlong, c_ssize_t,
1628 c_uint, c_ulong, c_ulonglong, c_size_t)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 name = "PyUnicode_FromFormat"
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001630 _PyUnicode_FromFormat = getattr(pythonapi, name)
1631 _PyUnicode_FromFormat.restype = py_object
1632
1633 def PyUnicode_FromFormat(format, *args):
1634 cargs = tuple(
1635 py_object(arg) if isinstance(arg, str) else arg
1636 for arg in args)
1637 return _PyUnicode_FromFormat(format, *cargs)
Victor Stinner1205f272010-09-11 00:54:47 +00001638
1639 # ascii format, non-ascii argument
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001640 text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9')
Victor Stinner1205f272010-09-11 00:54:47 +00001641 self.assertEqual(text, 'ascii\x7f=unicode\xe9')
1642
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001643 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
1644 # raises an error
Ezio Melottied3a7d22010-12-01 02:32:32 +00001645 self.assertRaisesRegex(ValueError,
Victor Stinner1205f272010-09-11 00:54:47 +00001646 '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
Victor Stinner4c7db312010-09-12 07:51:18 +00001647 'string, got a non-ASCII byte: 0xe9$',
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001648 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +00001649
Victor Stinner96865452011-03-01 23:44:09 +00001650 # test "%c"
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001651 self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
1652 self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
1653
Victor Stinner96865452011-03-01 23:44:09 +00001654 # test "%"
1655 self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
1656 self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
1657 self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
1658 self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
1659 self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')
1660
Victor Stinner6d970f42011-03-02 00:04:25 +00001661 # test integer formats (%i, %d, %u)
Victor Stinner96865452011-03-01 23:44:09 +00001662 self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
1663 self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
Victor Stinner6d970f42011-03-02 00:04:25 +00001664 self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123')
1665 self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123')
1666 self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123')
1667 self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123')
Victor Stinner96865452011-03-01 23:44:09 +00001668
Victor Stinner6d970f42011-03-02 00:04:25 +00001669 self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123')
1670 self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123')
1671 self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123')
1672 self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123')
Victor Stinner96865452011-03-01 23:44:09 +00001673
Victor Stinner6d970f42011-03-02 00:04:25 +00001674 self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123')
1675 self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123')
1676 self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
1677 self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')
1678
1679 # test %A
Victor Stinnerca1e7ec2011-01-05 00:19:28 +00001680 text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
Victor Stinner9a909002010-10-18 20:59:24 +00001681 self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
1682
Victor Stinner6d970f42011-03-02 00:04:25 +00001683 # test %V
Victor Stinner2512a8b2011-03-01 22:46:52 +00001684 text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
1685 self.assertEqual(text, 'repr=abc')
1686
1687 # Test string decode from parameter of %s using utf-8.
1688 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
1689 # '\u4eba\u6c11'
1690 text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
1691 self.assertEqual(text, 'repr=\u4eba\u6c11')
1692
1693 #Test replace error handler.
1694 text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
1695 self.assertEqual(text, 'repr=abc\ufffd')
1696
Victor Stinner6d970f42011-03-02 00:04:25 +00001697 # not supported: copy the raw format string. These tests are just here
1698 # to check for crashes and should not be considered as specifications
1699 self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s')
1700 self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc')
1701 self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i')
1702 self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s')
1703
Victor Stinner1c24bd02010-10-02 11:03:13 +00001704 # Test PyUnicode_AsWideChar()
def test_aswidechar(self):
    # Drives _testcapi.unicode_aswidechar(text, buflen); each call is
    # unpacked as a (wchar, size) pair and both halves are checked.
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # (text, buffer length, expected size, expected buffer contents)
    scenarios = (
        # Buffer shorter than the text: only the first two characters
        # are expected back.
        ('abcdef', 2, 2, 'ab'),
        # Buffer exactly as long as the text: no trailing '\0' expected.
        ('abc', 3, 3, 'abc'),
        # Buffer longer than the text: a single trailing '\0' is expected
        # and is not counted in size.
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        # An embedded '\0' is expected to be preserved and counted.
        ('abc\0def', 20, 7, 'abc\0def\0'),
    )
    for text, buflen, expected_size, expected_wchar in scenarios:
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)

    # For a non-BMP character the expected size depends on the width of
    # c_wchar: two units when it is 2 bytes wide, one unit otherwise.
    # The buffer length is one more than that in both cases.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) != 2:  # i.e. sizeof(c_wchar) == 4
        buflen, nchar = 2, 1
    else:
        buflen, nchar = 3, 2
    wchar, size = unicode_aswidechar(nonbmp, buflen)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00001740
Victor Stinner1c24bd02010-10-02 11:03:13 +00001741 # Test PyUnicode_AsWideCharString()
def test_aswidecharstring(self):
    # Drives _testcapi.unicode_aswidecharstring(text); each call is
    # unpacked as a (wchar, size) pair and both halves are checked.
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # For a non-BMP character the expected size depends on the width of
    # c_wchar: two units when it is 2 bytes wide, one unit otherwise
    # (i.e. sizeof(c_wchar) == 4).
    nonbmp = chr(0x10ffff)
    nchar = 2 if sizeof(c_wchar) == 2 else 1

    # (text, expected size); the expected buffer is always the text
    # followed by one '\0' that is not counted in size.
    scenarios = (
        ('abc', 3),
        ('abc\0def', 7),
        (nonbmp, nchar),
    )
    for text, expected_size in scenarios:
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, text + '\0')
Victor Stinner5593d8a2010-10-02 11:11:27 +00001763
Benjamin Peterson811c2f12011-09-30 21:31:21 -04001764 def test_subclass_add(self):
1765 class S(str):
1766 def __add__(self, o):
1767 return "3"
1768 self.assertEqual(S("4") + S("5"), "3")
1769 class S(str):
1770 def __iadd__(self, o):
1771 return "3"
1772 s = S("1")
1773 s += "4"
1774 self.assertEqual(s, "3")
1775
Victor Stinner1c24bd02010-10-02 11:03:13 +00001776
class StringModuleTest(unittest.TestCase):
    """Checks for ``_string.formatter_parser`` and
    ``_string.formatter_field_name_split``."""

    def test_formatter_parser(self):
        # Each entry pairs a format string with the exact list of 4-tuples
        # that iterating _string.formatter_parser() over it must yield.
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-string argument (here the int 1) must raise TypeError.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        def split(name):
            # Turn the result and its second element into lists so they
            # compare equal to the plain list literals below.
            parts = list(_string.formatter_field_name_split(name))
            parts[1] = list(parts[1])
            return parts

        # (field name, expected [first, rest]); in the expected pairs a
        # ".name" part carries True and a "[key]" part carries False.
        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                 ]]),
        )
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument (here the int 1) must raise TypeError.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
1826
1827
def test_main():
    """Hand this module's name to ``support.run_unittest``."""
    module_name = __name__
    support.run_unittest(module_name)


# Script entry point: call test_main() only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_main()