blob: b52e1f6d1cfbe613819237ad338a579759dc94ad [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import codecs
2import html.entities
3import sys
4import test.support
5import unicodedata
6import unittest
7import warnings
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008
class PosReturn:
    """Error callback whose returned position is configurable via ``pos``.

    ``handle`` is meant to be registered as a codec error callback; callers
    set ``pos`` beforehand to choose the position that is handed back.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return ``("<?>", pos)`` using the value of ``pos`` at call time.

        A negative ``pos`` is interpreted relative to ``len(exc.object)``
        when deciding whether the callback makes progress.
        """
        requested = self.pos
        if requested < 0:
            absolute = len(exc.object) + requested
        else:
            absolute = requested
        if absolute <= exc.start:
            # No progress was made this time: jump to the end of the input
            # on the next call, otherwise the caller would loop forever.
            self.pos = len(exc.object)
        return ("<?>", requested)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +000025
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``start`` attribute is assigned a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately store a non-integer in the start attribute.
        self.start = []
31
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``object`` attribute is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately store a non-string in the object attribute.
        self.object = []
37
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError from which the ``end`` attribute is deleted."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately remove the end attribute.
        del self.end
43
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError whose ``object`` attribute is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately store a non-bytes value in the object attribute.
        self.object = []
49
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError from which the ``start`` attribute is deleted."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove the start attribute.
        del self.start
55
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError from which the ``end`` attribute is deleted."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove the end attribute.
        del self.end
61
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError from which the ``object`` attribute is deleted."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove the object attribute.
        del self.object
67
Walter Dörwald3aeb6322002-09-02 13:14:32 +000068class CodecCallbackTest(unittest.TestCase):
69
70 def test_xmlcharrefreplace(self):
71 # replace unencodable characters which numeric character entities.
72 # For ascii, latin-1 and charmaps this is completely implemented
73 # in C and should be reasonably fast.
Guido van Rossumef87d6e2007-05-02 19:09:54 +000074 s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000075 self.assertEqual(
76 s.encode("ascii", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000077 b"&#12473;&#12497;&#12514; &#228;nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000078 )
79 self.assertEqual(
80 s.encode("latin-1", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000081 b"&#12473;&#12497;&#12514; \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000082 )
83
84 def test_xmlcharnamereplace(self):
85 # This time use a named character entity for unencodable
86 # characters, if one is available.
Walter Dörwald3aeb6322002-09-02 13:14:32 +000087
88 def xmlcharnamereplace(exc):
89 if not isinstance(exc, UnicodeEncodeError):
90 raise TypeError("don't know how to handle %r" % exc)
91 l = []
92 for c in exc.object[exc.start:exc.end]:
93 try:
Fred Drake3c50ea42008-05-17 22:02:32 +000094 l.append("&%s;" % html.entities.codepoint2name[ord(c)])
Walter Dörwald3aeb6322002-09-02 13:14:32 +000095 except KeyError:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000096 l.append("&#%d;" % ord(c))
97 return ("".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +000098
99 codecs.register_error(
100 "test.xmlcharnamereplace", xmlcharnamereplace)
101
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000102 sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
Walter Dörwald00048f02007-05-09 10:44:06 +0000103 sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000104 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000105 sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000106 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000107 sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000108 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
109
110 def test_uninamereplace(self):
111 # We're using the names from the unicode database this time,
Walter Dörwald00445d22002-11-25 17:58:02 +0000112 # and we're doing "syntax highlighting" here, i.e. we include
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000113 # the replaced text in ANSI escape sequences. For this it is
114 # useful that the error handler is not called for every single
115 # unencodable character, but for a complete sequence of
116 # unencodable characters, otherwise we would output many
Mark Dickinson934896d2009-02-21 20:59:32 +0000117 # unnecessary escape sequences.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000118
119 def uninamereplace(exc):
120 if not isinstance(exc, UnicodeEncodeError):
121 raise TypeError("don't know how to handle %r" % exc)
122 l = []
123 for c in exc.object[exc.start:exc.end]:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000124 l.append(unicodedata.name(c, "0x%x" % ord(c)))
125 return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000126
127 codecs.register_error(
128 "test.uninamereplace", uninamereplace)
129
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000130 sin = "\xac\u1234\u20ac\u8000"
Walter Dörwald00048f02007-05-09 10:44:06 +0000131 sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000132 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
133
Walter Dörwald00048f02007-05-09 10:44:06 +0000134 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000135 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
136
Walter Dörwald00048f02007-05-09 10:44:06 +0000137 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000138 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
139
140 def test_backslashescape(self):
141 # Does the same as the "unicode-escape" encoding, but with different
142 # base encodings.
Ezio Melottia9860ae2011-10-04 19:06:00 +0300143 sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
144 sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000145 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
146
Ezio Melottia9860ae2011-10-04 19:06:00 +0300147 sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000148 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
149
Ezio Melottia9860ae2011-10-04 19:06:00 +0300150 sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000151 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
152
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200153 def test_nameescape(self):
154 # Does the same as backslashescape, but prefers ``\N{...}`` escape
155 # sequences.
156 sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
157 sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
158 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
159 self.assertEqual(sin.encode("ascii", "namereplace"), sout)
160
161 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
162 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
163 self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
164
165 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
166 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
167 self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
168
Ezio Melotti57221d02010-07-01 07:32:02 +0000169 def test_decoding_callbacks(self):
170 # This is a test for a decoding callback handler
171 # that allows the decoding of the invalid sequence
172 # "\xc0\x80" and returns "\x00" instead of raising an error.
173 # All other illegal sequences will be handled strictly.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000174 def relaxedutf8(exc):
175 if not isinstance(exc, UnicodeDecodeError):
176 raise TypeError("don't know how to handle %r" % exc)
Ezio Melotti57221d02010-07-01 07:32:02 +0000177 if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000178 return ("\x00", exc.start+2) # retry after two bytes
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000179 else:
180 raise exc
181
Ezio Melotti57221d02010-07-01 07:32:02 +0000182 codecs.register_error("test.relaxedutf8", relaxedutf8)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000183
Ezio Melotti57221d02010-07-01 07:32:02 +0000184 # all the "\xc0\x80" will be decoded to "\x00"
Walter Dörwald00048f02007-05-09 10:44:06 +0000185 sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000186 sout = "a\x00b\x00c\xfc\x00\x00"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000187 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
Ezio Melotti57221d02010-07-01 07:32:02 +0000188
189 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
Walter Dörwald00048f02007-05-09 10:44:06 +0000190 sin = b"\xc0\x80\xc0\x81"
Ezio Melotti57221d02010-07-01 07:32:02 +0000191 self.assertRaises(UnicodeDecodeError, sin.decode,
192 "utf-8", "test.relaxedutf8")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000193
194 def test_charmapencode(self):
195 # For charmap encodings the replacement string will be
196 # mapped through the encoding again. This means, that
197 # to be able to use e.g. the "replace" handler, the
198 # charmap has to have a mapping for "?".
Guido van Rossum98297ee2007-11-06 21:34:58 +0000199 charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000200 sin = "abc"
Walter Dörwald00048f02007-05-09 10:44:06 +0000201 sout = b"AABBCC"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000202 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000203
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000204 sin = "abcA"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000205 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
206
Guido van Rossum98297ee2007-11-06 21:34:58 +0000207 charmap[ord("?")] = b"XYZ"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000208 sin = "abcDEF"
Walter Dörwald00048f02007-05-09 10:44:06 +0000209 sout = b"AABBCCXYZXYZXYZ"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000210 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000211
Walter Dörwald00048f02007-05-09 10:44:06 +0000212 charmap[ord("?")] = "XYZ" # wrong type in mapping
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000213 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
214
    def test_decodeunicodeinternal(self):
        # Decode a 5-byte input with the "unicode-internal" codec under
        # several error handlers.  Everything runs inside check_warnings,
        # which is given a filter for the codec's DeprecationWarning.
        with test.support.check_warnings(('unicode_internal codec has been '
                                          'deprecated', DeprecationWarning)):
            # With the default error handling the input is rejected.
            self.assertRaises(
                UnicodeDecodeError,
                b"\x00\x00\x00\x00\x00".decode,
                "unicode-internal",
            )
            # The expected results below are only checked when "\0"
            # encodes to 4 bytes in this codec.
            if len('\0'.encode('unicode-internal')) == 4:
                def handler_unicodeinternal(exc):
                    # Custom handler: always returns "\x01" as replacement
                    # and 1 as the position.
                    if not isinstance(exc, UnicodeDecodeError):
                        raise TypeError("don't know how to handle %r" % exc)
                    return ("\x01", 1)

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
                    "\u0000"
                )

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
                    "\u0000\ufffd"
                )

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"),
                    "\u0000\\x00"
                )

                # The handler must be registered before it can be named
                # as the errors argument below.
                codecs.register_error("test.hui", handler_unicodeinternal)

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
                    "\u0000\u0001\u0000"
                )
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000250
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000251 def test_callbacks(self):
252 def handler1(exc):
Walter Dörwald00048f02007-05-09 10:44:06 +0000253 r = range(exc.start, exc.end)
254 if isinstance(exc, UnicodeEncodeError):
255 l = ["<%d>" % ord(exc.object[pos]) for pos in r]
256 elif isinstance(exc, UnicodeDecodeError):
257 l = ["<%d>" % exc.object[pos] for pos in r]
258 else:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000259 raise TypeError("don't know how to handle %r" % exc)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000260 return ("[%s]" % "".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000261
262 codecs.register_error("test.handler1", handler1)
263
264 def handler2(exc):
265 if not isinstance(exc, UnicodeDecodeError):
266 raise TypeError("don't know how to handle %r" % exc)
Walter Dörwald00048f02007-05-09 10:44:06 +0000267 l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000268 return ("[%s]" % "".join(l), exc.end+1) # skip one character
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000269
270 codecs.register_error("test.handler2", handler2)
271
Walter Dörwald00048f02007-05-09 10:44:06 +0000272 s = b"\x00\x81\x7f\x80\xff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000273
274 self.assertEqual(
275 s.decode("ascii", "test.handler1"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000276 "\x00[<129>]\x7f[<128>][<255>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000277 )
278 self.assertEqual(
279 s.decode("ascii", "test.handler2"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000280 "\x00[<129>][<128>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000281 )
282
283 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000284 b"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200285 "\u3042[<92><117><51>]xxx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000286 )
287
288 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000289 b"\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200290 "\u3042[<92><117><51>]xx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000291 )
292
293 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000294 codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000295 "z[<98>][<99>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000296 )
297
298 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000299 "g\xfc\xdfrk".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000300 b"g[<252><223>]rk"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000301 )
302
303 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000304 "g\xfc\xdf".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000305 b"g[<252><223>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000306 )
307
308 def test_longstrings(self):
309 # test long strings to check for memory overflow problems
Walter Dörwald41980ca2007-08-16 21:55:45 +0000310 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200311 "backslashreplace", "namereplace"]
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000312 # register the handlers under different names,
313 # to prevent the codec from recognizing the name
314 for err in errors:
315 codecs.register_error("test." + err, codecs.lookup_error(err))
316 l = 1000
317 errors += [ "test." + err for err in errors ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000318 for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
Walter Dörwald41980ca2007-08-16 21:55:45 +0000319 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
320 "utf-8", "utf-7", "utf-16", "utf-32"):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000321 for err in errors:
Tim Peters3de75262002-11-09 05:26:15 +0000322 try:
323 uni.encode(enc, err)
324 except UnicodeError:
325 pass
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000326
327 def check_exceptionobjectargs(self, exctype, args, msg):
328 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
329 # check with one missing argument
330 self.assertRaises(TypeError, exctype, *args[:-1])
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000331 # check with one argument too much
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000332 self.assertRaises(TypeError, exctype, *(args + ["too much"]))
333 # check with one argument of the wrong type
Guido van Rossum98297ee2007-11-06 21:34:58 +0000334 wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
Guido van Rossum805365e2007-05-07 22:24:25 +0000335 for i in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000336 for wrongarg in wrongargs:
337 if type(wrongarg) is type(args[i]):
Tim Peters3de75262002-11-09 05:26:15 +0000338 continue
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000339 # build argument array
340 callargs = []
Guido van Rossum805365e2007-05-07 22:24:25 +0000341 for j in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000342 if i==j:
343 callargs.append(wrongarg)
344 else:
345 callargs.append(args[i])
346 self.assertRaises(TypeError, exctype, *callargs)
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000347
348 # check with the correct number and type of arguments
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000349 exc = exctype(*args)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000350 self.assertEqual(str(exc), msg)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000351
352 def test_unicodeencodeerror(self):
353 self.check_exceptionobjectargs(
354 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000355 ["ascii", "g\xfcrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000356 "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000357 )
358 self.check_exceptionobjectargs(
359 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000360 ["ascii", "g\xfcrk", 1, 4, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000361 "'ascii' codec can't encode characters in position 1-3: ouch"
362 )
363 self.check_exceptionobjectargs(
364 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000365 ["ascii", "\xfcx", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000366 "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000367 )
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000368 self.check_exceptionobjectargs(
369 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000370 ["ascii", "\u0100x", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000371 "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000372 )
373 self.check_exceptionobjectargs(
374 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000375 ["ascii", "\uffffx", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000376 "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000377 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200378 self.check_exceptionobjectargs(
379 UnicodeEncodeError,
380 ["ascii", "\U00010000x", 0, 1, "ouch"],
381 "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
382 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000383
384 def test_unicodedecodeerror(self):
385 self.check_exceptionobjectargs(
386 UnicodeDecodeError,
Guido van Rossum254348e2007-11-21 19:29:53 +0000387 ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000388 "'ascii' codec can't decode byte 0xfc in position 1: ouch"
389 )
390 self.check_exceptionobjectargs(
391 UnicodeDecodeError,
Guido van Rossum254348e2007-11-21 19:29:53 +0000392 ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000393 "'ascii' codec can't decode bytes in position 1-2: ouch"
394 )
395
396 def test_unicodetranslateerror(self):
397 self.check_exceptionobjectargs(
398 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000399 ["g\xfcrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000400 "can't translate character '\\xfc' in position 1: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000401 )
402 self.check_exceptionobjectargs(
403 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000404 ["g\u0100rk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000405 "can't translate character '\\u0100' in position 1: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000406 )
407 self.check_exceptionobjectargs(
408 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000409 ["g\uffffrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000410 "can't translate character '\\uffff' in position 1: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000411 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200412 self.check_exceptionobjectargs(
413 UnicodeTranslateError,
414 ["g\U00010000rk", 1, 2, "ouch"],
415 "can't translate character '\\U00010000' in position 1: ouch"
416 )
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000417 self.check_exceptionobjectargs(
418 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000419 ["g\xfcrk", 1, 3, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000420 "can't translate characters in position 1-2: ouch"
421 )
422
423 def test_badandgoodstrictexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000424 # "strict" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000425 self.assertRaises(
426 TypeError,
427 codecs.strict_errors,
428 42
429 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000430 # "strict" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000431 self.assertRaises(
432 Exception,
433 codecs.strict_errors,
434 Exception("ouch")
435 )
436
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000437 # If the correct exception is passed in, "strict" raises it
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000438 self.assertRaises(
439 UnicodeEncodeError,
440 codecs.strict_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000441 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000442 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200443 self.assertRaises(
444 UnicodeDecodeError,
445 codecs.strict_errors,
446 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
447 )
448 self.assertRaises(
449 UnicodeTranslateError,
450 codecs.strict_errors,
451 UnicodeTranslateError("\u3042", 0, 1, "ouch")
452 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000453
454 def test_badandgoodignoreexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000455 # "ignore" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 self.assertRaises(
457 TypeError,
458 codecs.ignore_errors,
459 42
460 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000461 # "ignore" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000462 self.assertRaises(
463 TypeError,
464 codecs.ignore_errors,
465 UnicodeError("ouch")
466 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000467 # If the correct exception is passed in, "ignore" returns an empty replacement
Ezio Melottib3aedd42010-11-20 19:04:17 +0000468 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000469 codecs.ignore_errors(
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000471 ("", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000472 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000473 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000474 codecs.ignore_errors(
Guido van Rossum254348e2007-11-21 19:29:53 +0000475 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000476 ("", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000477 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000478 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000479 codecs.ignore_errors(
Walter Dörwaldd2034312007-05-18 16:29:38 +0000480 UnicodeTranslateError("\u3042", 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000481 ("", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000482 )
483
484 def test_badandgoodreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000485 # "replace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000486 self.assertRaises(
487 TypeError,
488 codecs.replace_errors,
489 42
490 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000491 # "replace" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000492 self.assertRaises(
493 TypeError,
494 codecs.replace_errors,
495 UnicodeError("ouch")
496 )
Walter Dörwald690402f2005-11-17 18:51:34 +0000497 self.assertRaises(
Walter Dörwald690402f2005-11-17 18:51:34 +0000498 TypeError,
499 codecs.replace_errors,
500 BadObjectUnicodeEncodeError()
501 )
502 self.assertRaises(
Walter Dörwald690402f2005-11-17 18:51:34 +0000503 TypeError,
504 codecs.replace_errors,
505 BadObjectUnicodeDecodeError()
506 )
Guido van Rossum805365e2007-05-07 22:24:25 +0000507 # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
Ezio Melottib3aedd42010-11-20 19:04:17 +0000508 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000509 codecs.replace_errors(
Walter Dörwaldd2034312007-05-18 16:29:38 +0000510 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000511 ("?", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000512 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000513 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000514 codecs.replace_errors(
Guido van Rossum254348e2007-11-21 19:29:53 +0000515 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000516 ("\ufffd", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000517 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000518 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000519 codecs.replace_errors(
Walter Dörwaldd2034312007-05-18 16:29:38 +0000520 UnicodeTranslateError("\u3042", 0, 1, "ouch")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000521 ("\ufffd", 1)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522 )
523
524 def test_badandgoodxmlcharrefreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000525 # "xmlcharrefreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526 self.assertRaises(
527 TypeError,
528 codecs.xmlcharrefreplace_errors,
529 42
530 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000531 # "xmlcharrefreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000532 self.assertRaises(
533 TypeError,
534 codecs.xmlcharrefreplace_errors,
535 UnicodeError("ouch")
536 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000537 # "xmlcharrefreplace" can only be used for encoding
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000538 self.assertRaises(
539 TypeError,
540 codecs.xmlcharrefreplace_errors,
Guido van Rossum254348e2007-11-21 19:29:53 +0000541 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000542 )
543 self.assertRaises(
544 TypeError,
545 codecs.xmlcharrefreplace_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000546 UnicodeTranslateError("\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000547 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000548 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200549 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
550 999999, 1000000)
551 cs += (0xd800, 0xdfff)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000552 s = "".join(chr(c) for c in cs)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000553 self.assertEqual(
Walter Dörwald690402f2005-11-17 18:51:34 +0000554 codecs.xmlcharrefreplace_errors(
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
Walter Dörwald690402f2005-11-17 18:51:34 +0000556 ),
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200557 ("".join("&#%d;" % c for c in cs), len(s))
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000558 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000559
560 def test_badandgoodbackslashreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000561 # "backslashreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000562 self.assertRaises(
563 TypeError,
564 codecs.backslashreplace_errors,
565 42
566 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000567 # "backslashreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000568 self.assertRaises(
569 TypeError,
570 codecs.backslashreplace_errors,
571 UnicodeError("ouch")
572 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000573 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200574 tests = [
575 ("\u3042", "\\u3042"),
576 ("\n", "\\x0a"),
577 ("a", "\\x61"),
578 ("\x00", "\\x00"),
579 ("\xff", "\\xff"),
580 ("\u0100", "\\u0100"),
581 ("\uffff", "\\uffff"),
582 ("\U00010000", "\\U00010000"),
583 ("\U0010ffff", "\\U0010ffff"),
584 # Lone surrogates
585 ("\ud800", "\\ud800"),
586 ("\udfff", "\\udfff"),
587 ("\ud800\udfff", "\\ud800\\udfff"),
588 ]
589 for s, r in tests:
590 with self.subTest(str=s):
591 self.assertEqual(
592 codecs.backslashreplace_errors(
593 UnicodeEncodeError("ascii", s, 0, len(s), "ouch")),
594 (r, len(s))
595 )
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200596 self.assertEqual(
597 codecs.backslashreplace_errors(
598 UnicodeTranslateError(s, 0, len(s), "ouch")),
599 (r, len(s))
600 )
601 tests = [
602 (b"a", "\\x61"),
603 (b"\n", "\\x0a"),
604 (b"\x00", "\\x00"),
605 (b"\xff", "\\xff"),
606 ]
607 for b, r in tests:
608 with self.subTest(bytes=b):
609 self.assertEqual(
610 codecs.backslashreplace_errors(
611 UnicodeDecodeError("ascii", bytearray(b), 0, 1, "ouch")),
612 (r, 1)
613 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000614
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200615 def test_badandgoodnamereplaceexceptions(self):
616 # "namereplace" complains about a non-exception passed in
617 self.assertRaises(
618 TypeError,
619 codecs.namereplace_errors,
620 42
621 )
622 # "namereplace" complains about the wrong exception types
623 self.assertRaises(
624 TypeError,
625 codecs.namereplace_errors,
626 UnicodeError("ouch")
627 )
628 # "namereplace" can only be used for encoding
629 self.assertRaises(
630 TypeError,
631 codecs.namereplace_errors,
632 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
633 )
634 self.assertRaises(
635 TypeError,
636 codecs.namereplace_errors,
637 UnicodeTranslateError("\u3042", 0, 1, "ouch")
638 )
639 # Use the correct exception
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200640 tests = [
641 ("\u3042", "\\N{HIRAGANA LETTER A}"),
642 ("\x00", "\\x00"),
643 ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
644 "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
645 ("\U000e007f", "\\N{CANCEL TAG}"),
646 ("\U0010ffff", "\\U0010ffff"),
647 # Lone surrogates
648 ("\ud800", "\\ud800"),
649 ("\udfff", "\\udfff"),
650 ("\ud800\udfff", "\\ud800\\udfff"),
651 ]
652 for s, r in tests:
653 with self.subTest(str=s):
654 self.assertEqual(
655 codecs.namereplace_errors(
656 UnicodeEncodeError("ascii", s, 0, len(s), "ouch")),
657 (r, len(s))
658 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200659
660 def test_badandgoodsurrogateescapeexceptions(self):
661 surrogateescape_errors = codecs.lookup_error('surrogateescape')
662 # "surrogateescape" complains about a non-exception passed in
663 self.assertRaises(
664 TypeError,
665 surrogateescape_errors,
666 42
667 )
668 # "surrogateescape" complains about the wrong exception types
669 self.assertRaises(
670 TypeError,
671 surrogateescape_errors,
672 UnicodeError("ouch")
673 )
674 # "surrogateescape" can not be used for translating
675 self.assertRaises(
676 TypeError,
677 surrogateescape_errors,
678 UnicodeTranslateError("\udc80", 0, 1, "ouch")
679 )
680 # Use the correct exception
681 for s in ("a", "\udc7f", "\udd00"):
682 with self.subTest(str=s):
683 self.assertRaises(
684 UnicodeEncodeError,
685 surrogateescape_errors,
686 UnicodeEncodeError("ascii", s, 0, 1, "ouch")
687 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200688 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200689 surrogateescape_errors(
690 UnicodeEncodeError("ascii", "\udc80", 0, 1, "ouch")),
691 (b"\x80", 1)
692 )
693 self.assertRaises(
694 UnicodeDecodeError,
695 surrogateescape_errors,
696 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200697 )
698 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200699 surrogateescape_errors(
700 UnicodeDecodeError("ascii", bytearray(b"\x80"), 0, 1, "ouch")),
701 ("\udc80", 1)
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200702 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200703
704 def test_badandgoodsurrogatepassexceptions(self):
705 surrogatepass_errors = codecs.lookup_error('surrogatepass')
706 # "surrogatepass" complains about a non-exception passed in
707 self.assertRaises(
708 TypeError,
709 surrogatepass_errors,
710 42
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200711 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200712 # "surrogatepass" complains about the wrong exception types
713 self.assertRaises(
714 TypeError,
715 surrogatepass_errors,
716 UnicodeError("ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200717 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200718 # "surrogatepass" can not be used for translating
719 self.assertRaises(
720 TypeError,
721 surrogatepass_errors,
722 UnicodeTranslateError("\ud800", 0, 1, "ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200723 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200724 # Use the correct exception
725 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
726 with self.subTest(encoding=enc):
727 self.assertRaises(
728 UnicodeEncodeError,
729 surrogatepass_errors,
730 UnicodeEncodeError(enc, "a", 0, 1, "ouch")
731 )
732 self.assertRaises(
733 UnicodeDecodeError,
734 surrogatepass_errors,
735 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
736 )
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200737 for s in ("\ud800", "\udfff", "\ud800\udfff"):
738 with self.subTest(str=s):
739 self.assertRaises(
740 UnicodeEncodeError,
741 surrogatepass_errors,
742 UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
743 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200744 tests = [
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200745 ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
746 ("utf-16le", "\ud800", b'\x00\xd8', 2),
747 ("utf-16be", "\ud800", b'\xd8\x00', 2),
748 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
749 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200750 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
751 ("utf-16le", "\udfff", b'\xff\xdf', 2),
752 ("utf-16be", "\udfff", b'\xdf\xff', 2),
753 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
754 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200755 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
756 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
757 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
758 ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
759 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
760 ]
761 for enc, s, b, n in tests:
762 with self.subTest(encoding=enc, str=s, bytes=b):
763 self.assertEqual(
764 surrogatepass_errors(
765 UnicodeEncodeError(enc, s, 0, len(s), "ouch")),
766 (b, len(s))
767 )
768 self.assertEqual(
769 surrogatepass_errors(
770 UnicodeDecodeError(enc, bytearray(b[:n]), 0, n, "ouch")),
771 (s[:1], n)
772 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200773
def test_badhandlerresults(self):
    # A codec must raise TypeError when the registered error handler
    # returns something unusable. The values below are not tuples
    # (42, "foo"), are tuples of the wrong length, or carry a position
    # that is not an integer. Each distinct value is listed once.
    results = (42, "foo", (1, 2, 3), ("foo", 1, 3), ("foo", None), ("foo",))
    encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
    # (encoding, input that this test expects to reach the error handler)
    undecodable = (
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-7", b"+x-"),
        ("unicode-internal", b"\x00"),
    )

    for res in results:
        # Bind the current value as a default argument so the handler
        # keeps returning it regardless of later loop iterations.
        codecs.register_error("test.badhandler", lambda exc, res=res: res)
        for enc in encs:
            with self.assertRaises(TypeError):
                "\u3042".encode(enc, "test.badhandler")
        for enc, data in undecodable:
            with test.support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError):
                    data.decode(enc, "test.badhandler")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801
802 def test_lookup(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +0000803 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
804 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
805 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
806 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 codecs.xmlcharrefreplace_errors,
808 codecs.lookup_error("xmlcharrefreplace")
809 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000810 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 codecs.backslashreplace_errors,
812 codecs.lookup_error("backslashreplace")
813 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200814 self.assertEqual(
815 codecs.namereplace_errors,
816 codecs.lookup_error("namereplace")
817 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000818
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000819 def test_unencodablereplacement(self):
820 def unencrepl(exc):
821 if isinstance(exc, UnicodeEncodeError):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000822 return ("\u4242", exc.end)
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000823 else:
824 raise TypeError("don't know how to handle %r" % exc)
825 codecs.register_error("test.unencreplhandler", unencrepl)
826 for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
827 self.assertRaises(
828 UnicodeEncodeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000829 "\u4242".encode,
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000830 enc,
831 "test.unencreplhandler"
832 )
833
Walter Dörwald30537a42003-01-08 23:22:13 +0000834 def test_badregistercall(self):
835 # enhance coverage of:
836 # Modules/_codecsmodule.c::register_error()
837 # Python/codecs.c::PyCodec_RegisterError()
838 self.assertRaises(TypeError, codecs.register_error, 42)
839 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
840
Walter Dörwalde22d3392005-11-17 08:52:34 +0000841 def test_badlookupcall(self):
842 # enhance coverage of:
843 # Modules/_codecsmodule.c::lookup_error()
844 self.assertRaises(TypeError, codecs.lookup_error)
845
Walter Dörwald30537a42003-01-08 23:22:13 +0000846 def test_unknownhandler(self):
847 # enhance coverage of:
848 # Modules/_codecsmodule.c::lookup_error()
849 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
850
851 def test_xmlcharrefvalues(self):
852 # enhance coverage of:
853 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
854 # and inline implementations
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200855 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
856 500000, 1000000)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000857 s = "".join([chr(x) for x in v])
Walter Dörwald30537a42003-01-08 23:22:13 +0000858 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
859 for enc in ("ascii", "iso-8859-15"):
860 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
861 s.encode(enc, err)
862
def test_decodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
    # and callers
    with self.assertRaises(LookupError):
        b"\xff".decode("ascii", "test.unknown")

    # A handler result that is not a tuple
    def baddecodereturn1(exc):
        return 42
    codecs.register_error("test.baddecodereturn1", baddecodereturn1)
    for data, encoding in (
        (b"\xff", "ascii"),
        (b"\\", "unicode-escape"),
        (b"\\x0", "unicode-escape"),
        (b"\\x0y", "unicode-escape"),
        (b"\\Uffffeeee", "unicode-escape"),
        (b"\\uyyyy", "raw-unicode-escape"),
    ):
        with self.assertRaises(TypeError):
            data.decode(encoding, "test.baddecodereturn1")

    # A handler result whose position is None
    def baddecodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.baddecodereturn2", baddecodereturn2)
    with self.assertRaises(TypeError):
        b"\xff".decode("ascii", "test.baddecodereturn2")

    handler = PosReturn()
    codecs.register_error("test.posreturn", handler.handle)

    # (position returned by the handler, decoded text or None when the
    # decoder is expected to raise IndexError)
    position_cases = (
        (-1, "<?>0"),    # Valid negative position
        (-2, "<?><?>"),  # Valid negative position
        (-3, None),      # Negative position out of bounds
        (1, "<?>0"),     # Valid positive position
        (2, "<?>"),      # Largest valid positive position (one beyond end of input)
        (3, None),       # Invalid positive position
    )
    for pos, decoded in position_cases:
        handler.pos = pos
        if decoded is None:
            with self.assertRaises(IndexError):
                b"\xff0".decode("ascii", "test.posreturn")
        else:
            self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), decoded)

    # Restart at the "0"
    handler.pos = 6
    self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")

    # Mapping whose every lookup fails with ValueError
    class D(dict):
        def __getitem__(self, key):
            raise ValueError
    for mapping, expected_exc in (
        ({0xff: None}, UnicodeError),
        (D(), ValueError),
        ({0xff: sys.maxunicode+1}, TypeError),
    ):
        with self.assertRaises(expected_exc):
            codecs.charmap_decode(b"\xff", "strict", mapping)
Walter Dörwald30537a42003-01-08 23:22:13 +0000921
def test_encodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
    # and callers
    with self.assertRaises(LookupError):
        "\xff".encode("ascii", "test.unknown")

    # A handler result that is not a tuple
    def badencodereturn1(exc):
        return 42
    codecs.register_error("test.badencodereturn1", badencodereturn1)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn1")

    # A handler result whose position is None
    def badencodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.badencodereturn2", badencodereturn2)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn2")

    handler = PosReturn()
    codecs.register_error("test.posreturn", handler.handle)

    # (position returned by the handler, encoded bytes or None when the
    # encoder is expected to raise IndexError)
    position_cases = (
        (-1, b"<?>0"),    # Valid negative position
        (-2, b"<?><?>"),  # Valid negative position
        (-3, None),       # Negative position out of bounds
        (1, b"<?>0"),     # Valid positive position
        (2, b"<?>"),      # Largest valid positive position (one beyond end of input)
        (3, None),        # Invalid positive position
    )
    for pos, encoded in position_cases:
        handler.pos = pos
        if encoded is None:
            with self.assertRaises(IndexError):
                "\xff0".encode("ascii", "test.posreturn")
        else:
            self.assertEqual("\xff0".encode("ascii", "test.posreturn"), encoded)

    handler.pos = 0

    # Mapping whose every lookup fails with ValueError
    class D(dict):
        def __getitem__(self, key):
            raise ValueError
    for err in ("strict", "replace", "xmlcharrefreplace",
                "backslashreplace", "namereplace", "test.posreturn"):
        for mapping, expected_exc in (
            ({0xff: None}, UnicodeError),
            (D(), ValueError),
            ({0xff: 300}, TypeError),
        ):
            with self.assertRaises(expected_exc):
                codecs.charmap_encode("\xff", err, mapping)
Walter Dörwald30537a42003-01-08 23:22:13 +0000975
976 def test_translatehelper(self):
977 # enhance coverage of:
978 # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
979 # and callers
980 # (Unfortunately the errors argument is not directly accessible
981 # from Python, so we can't test that much)
982 class D(dict):
983 def __getitem__(self, key):
984 raise ValueError
Georg Brandledbcc132007-10-24 21:25:34 +0000985 #self.assertRaises(ValueError, "\xff".translate, D())
Victor Stinnere49a95f2014-04-05 15:35:01 +0200986 self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000987 self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
Walter Dörwald30537a42003-01-08 23:22:13 +0000988
Walter Dörwald4894c302003-10-24 14:25:28 +0000989 def test_bug828737(self):
990 charmap = {
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000991 ord("&"): "&amp;",
992 ord("<"): "&lt;",
993 ord(">"): "&gt;",
994 ord('"'): "&quot;",
Walter Dörwald4894c302003-10-24 14:25:28 +0000995 }
Tim Peters58eb11c2004-01-18 20:29:55 +0000996
Walter Dörwald4894c302003-10-24 14:25:28 +0000997 for n in (1, 10, 100, 1000):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000998 text = 'abc<def>ghi'*n
Walter Dörwald4894c302003-10-24 14:25:28 +0000999 text.translate(charmap)
1000
def test_mutatingdecodehandler(self):
    # Decoders must fail cleanly with TypeError when the error handler
    # tampers with the exception's "object" attribute.
    baddata = [
        ("ascii", b"\xff"),
        ("utf-7", b"++"),
        ("utf-8", b"\xff"),
        ("utf-16", b"\xff"),
        ("utf-32", b"\xff"),
        ("unicode-escape", b"\\u123g"),
        ("raw-unicode-escape", b"\\u123g"),
        ("unicode-internal", b"\xff"),
    ]

    # Handler that replaces the input with a non-bytes object
    def replacing(exc):
        if isinstance(exc, UnicodeDecodeError):
            exc.object = 42
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.replacing", replacing)

    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            with self.assertRaises(TypeError):
                data.decode(encoding, "test.replacing")

    # Handler that tries to empty the input in place
    # NOTE(review): presumably exc.object is an immutable bytes object
    # here, so the slice assignment itself raises TypeError -- confirm.
    def mutating(exc):
        if isinstance(exc, UnicodeDecodeError):
            exc.object[:] = b""
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.mutating", mutating)
    # If the decoder doesn't pick up the modified input the following
    # will lead to an endless loop
    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            with self.assertRaises(TypeError):
                # Use the handler registered just above; the original
                # passed "test.replacing" again, so "test.mutating" was
                # never exercised.
                data.decode(encoding, "test.mutating")
Walter Dörwalde78178e2007-07-30 13:31:40 +00001041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001042
if __name__ == "__main__":
    # Run every test in this module when the file is executed as a script
    unittest.main()