blob: ee1e28a763ca5ce30acfb949b7a8eefa2adbf547 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import codecs
2import html.entities
3import sys
4import test.support
5import unicodedata
6import unittest
7import warnings
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008
class PosReturn:
    """Error-callback helper whose resume position is configured via ``pos``.

    ``handle`` always answers with the replacement ``"<?>"`` and the
    position stored in ``pos`` at the time of the call.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return ``("<?>", pos)`` for the codec error *exc*.

        A negative ``pos`` is interpreted relative to the end of
        ``exc.object`` when deciding whether the position advances.
        """
        requested = self.pos
        absolute = requested if requested >= 0 else len(exc.object) + requested
        # When the requested position does not move past the error start,
        # arrange for the next call to resume at the very end of the input;
        # otherwise the codec would call us forever.
        if exc.start >= absolute:
            self.pos = len(exc.object)
        return ("<?>", requested)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +000025
Walter Dörwald690402f2005-11-17 18:51:34 +000026# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``start`` is overwritten with a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # NOTE(review): if the interpreter stores ``start`` in an
        # integer-typed slot, this assignment itself may raise -- confirm.
        self.start = []
31
Walter Dörwald690402f2005-11-17 18:51:34 +000032# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``object`` is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately not a str, so handlers see an unusable object.
        self.object = []
37
38# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose ``end`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # NOTE(review): deleting ``end`` may be rejected by the interpreter
        # if it is stored in a numeric slot -- confirm.
        del self.end
43
44# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose ``object`` is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately not bytes-like, so handlers see an unusable object.
        self.object = []
49
50# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``start`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # NOTE(review): deleting ``start`` may be rejected by the interpreter
        # if it is stored in a numeric slot -- confirm.
        del self.start
55
56# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``end`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # NOTE(review): deleting ``end`` may be rejected by the interpreter
        # if it is stored in a numeric slot -- confirm.
        del self.end
61
62# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``object`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Remove the text being translated so handlers cannot rely on it.
        del self.object
67
Walter Dörwald3aeb6322002-09-02 13:14:32 +000068class CodecCallbackTest(unittest.TestCase):
69
70 def test_xmlcharrefreplace(self):
71 # replace unencodable characters which numeric character entities.
72 # For ascii, latin-1 and charmaps this is completely implemented
73 # in C and should be reasonably fast.
Guido van Rossumef87d6e2007-05-02 19:09:54 +000074 s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000075 self.assertEqual(
76 s.encode("ascii", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000077 b"&#12473;&#12497;&#12514; &#228;nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000078 )
79 self.assertEqual(
80 s.encode("latin-1", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000081 b"&#12473;&#12497;&#12514; \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000082 )
83
84 def test_xmlcharnamereplace(self):
85 # This time use a named character entity for unencodable
86 # characters, if one is available.
Walter Dörwald3aeb6322002-09-02 13:14:32 +000087
88 def xmlcharnamereplace(exc):
89 if not isinstance(exc, UnicodeEncodeError):
90 raise TypeError("don't know how to handle %r" % exc)
91 l = []
92 for c in exc.object[exc.start:exc.end]:
93 try:
Fred Drake3c50ea42008-05-17 22:02:32 +000094 l.append("&%s;" % html.entities.codepoint2name[ord(c)])
Walter Dörwald3aeb6322002-09-02 13:14:32 +000095 except KeyError:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000096 l.append("&#%d;" % ord(c))
97 return ("".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +000098
99 codecs.register_error(
100 "test.xmlcharnamereplace", xmlcharnamereplace)
101
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000102 sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
Walter Dörwald00048f02007-05-09 10:44:06 +0000103 sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000104 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000105 sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000106 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000107 sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000108 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
109
110 def test_uninamereplace(self):
111 # We're using the names from the unicode database this time,
Walter Dörwald00445d22002-11-25 17:58:02 +0000112 # and we're doing "syntax highlighting" here, i.e. we include
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000113 # the replaced text in ANSI escape sequences. For this it is
114 # useful that the error handler is not called for every single
115 # unencodable character, but for a complete sequence of
116 # unencodable characters, otherwise we would output many
Mark Dickinson934896d2009-02-21 20:59:32 +0000117 # unnecessary escape sequences.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000118
119 def uninamereplace(exc):
120 if not isinstance(exc, UnicodeEncodeError):
121 raise TypeError("don't know how to handle %r" % exc)
122 l = []
123 for c in exc.object[exc.start:exc.end]:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000124 l.append(unicodedata.name(c, "0x%x" % ord(c)))
125 return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000126
127 codecs.register_error(
128 "test.uninamereplace", uninamereplace)
129
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000130 sin = "\xac\u1234\u20ac\u8000"
Walter Dörwald00048f02007-05-09 10:44:06 +0000131 sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000132 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
133
Walter Dörwald00048f02007-05-09 10:44:06 +0000134 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000135 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
136
Walter Dörwald00048f02007-05-09 10:44:06 +0000137 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000138 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
139
140 def test_backslashescape(self):
141 # Does the same as the "unicode-escape" encoding, but with different
142 # base encodings.
Ezio Melottia9860ae2011-10-04 19:06:00 +0300143 sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
144 sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000145 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
146
Ezio Melottia9860ae2011-10-04 19:06:00 +0300147 sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000148 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
149
Ezio Melottia9860ae2011-10-04 19:06:00 +0300150 sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000151 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
152
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200153 def test_nameescape(self):
154 # Does the same as backslashescape, but prefers ``\N{...}`` escape
155 # sequences.
156 sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
157 sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
158 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
159 self.assertEqual(sin.encode("ascii", "namereplace"), sout)
160
161 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
162 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
163 self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
164
165 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
166 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
167 self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
168
Ezio Melotti57221d02010-07-01 07:32:02 +0000169 def test_decoding_callbacks(self):
170 # This is a test for a decoding callback handler
171 # that allows the decoding of the invalid sequence
172 # "\xc0\x80" and returns "\x00" instead of raising an error.
173 # All other illegal sequences will be handled strictly.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000174 def relaxedutf8(exc):
175 if not isinstance(exc, UnicodeDecodeError):
176 raise TypeError("don't know how to handle %r" % exc)
Ezio Melotti57221d02010-07-01 07:32:02 +0000177 if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000178 return ("\x00", exc.start+2) # retry after two bytes
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000179 else:
180 raise exc
181
Ezio Melotti57221d02010-07-01 07:32:02 +0000182 codecs.register_error("test.relaxedutf8", relaxedutf8)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000183
Ezio Melotti57221d02010-07-01 07:32:02 +0000184 # all the "\xc0\x80" will be decoded to "\x00"
Walter Dörwald00048f02007-05-09 10:44:06 +0000185 sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000186 sout = "a\x00b\x00c\xfc\x00\x00"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000187 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
Ezio Melotti57221d02010-07-01 07:32:02 +0000188
189 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
Walter Dörwald00048f02007-05-09 10:44:06 +0000190 sin = b"\xc0\x80\xc0\x81"
Ezio Melotti57221d02010-07-01 07:32:02 +0000191 self.assertRaises(UnicodeDecodeError, sin.decode,
192 "utf-8", "test.relaxedutf8")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000193
194 def test_charmapencode(self):
195 # For charmap encodings the replacement string will be
196 # mapped through the encoding again. This means, that
197 # to be able to use e.g. the "replace" handler, the
198 # charmap has to have a mapping for "?".
Guido van Rossum98297ee2007-11-06 21:34:58 +0000199 charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000200 sin = "abc"
Walter Dörwald00048f02007-05-09 10:44:06 +0000201 sout = b"AABBCC"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000202 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000203
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000204 sin = "abcA"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000205 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
206
Guido van Rossum98297ee2007-11-06 21:34:58 +0000207 charmap[ord("?")] = b"XYZ"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000208 sin = "abcDEF"
Walter Dörwald00048f02007-05-09 10:44:06 +0000209 sout = b"AABBCCXYZXYZXYZ"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000210 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000211
Walter Dörwald00048f02007-05-09 10:44:06 +0000212 charmap[ord("?")] = "XYZ" # wrong type in mapping
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000213 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
214
    def test_decodeunicodeinternal(self):
        # Decode a five-byte, all-zero input with the "unicode-internal"
        # codec under several error handlers.  All calls run inside
        # check_warnings(), which is given the codec's DeprecationWarning.
        with test.support.check_warnings(('unicode_internal codec has been '
                                          'deprecated', DeprecationWarning)):
            # Without an error handler argument the decode must fail.
            self.assertRaises(
                UnicodeDecodeError,
                b"\x00\x00\x00\x00\x00".decode,
                "unicode-internal",
            )
            # The expectations below are only checked when a single
            # character encodes to four bytes.
            if len('\0'.encode('unicode-internal')) == 4:
                def handler_unicodeinternal(exc):
                    # Always substitute "\x01" and resume at offset 1.
                    if not isinstance(exc, UnicodeDecodeError):
                        raise TypeError("don't know how to handle %r" % exc)
                    return ("\x01", 1)

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
                    "\u0000"
                )

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
                    "\u0000\ufffd"
                )

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"),
                    "\u0000\\x00"
                )

                codecs.register_error("test.hui", handler_unicodeinternal)

                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
                    "\u0000\u0001\u0000"
                )
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000250
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000251 def test_callbacks(self):
252 def handler1(exc):
Walter Dörwald00048f02007-05-09 10:44:06 +0000253 r = range(exc.start, exc.end)
254 if isinstance(exc, UnicodeEncodeError):
255 l = ["<%d>" % ord(exc.object[pos]) for pos in r]
256 elif isinstance(exc, UnicodeDecodeError):
257 l = ["<%d>" % exc.object[pos] for pos in r]
258 else:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000259 raise TypeError("don't know how to handle %r" % exc)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000260 return ("[%s]" % "".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000261
262 codecs.register_error("test.handler1", handler1)
263
264 def handler2(exc):
265 if not isinstance(exc, UnicodeDecodeError):
266 raise TypeError("don't know how to handle %r" % exc)
Walter Dörwald00048f02007-05-09 10:44:06 +0000267 l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000268 return ("[%s]" % "".join(l), exc.end+1) # skip one character
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000269
270 codecs.register_error("test.handler2", handler2)
271
Walter Dörwald00048f02007-05-09 10:44:06 +0000272 s = b"\x00\x81\x7f\x80\xff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000273
274 self.assertEqual(
275 s.decode("ascii", "test.handler1"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000276 "\x00[<129>]\x7f[<128>][<255>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000277 )
278 self.assertEqual(
279 s.decode("ascii", "test.handler2"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000280 "\x00[<129>][<128>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000281 )
282
283 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000284 b"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200285 "\u3042[<92><117><51>]xxx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000286 )
287
288 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000289 b"\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200290 "\u3042[<92><117><51>]xx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000291 )
292
293 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000294 codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000295 "z[<98>][<99>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000296 )
297
298 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000299 "g\xfc\xdfrk".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000300 b"g[<252><223>]rk"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000301 )
302
303 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000304 "g\xfc\xdf".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000305 b"g[<252><223>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000306 )
307
308 def test_longstrings(self):
309 # test long strings to check for memory overflow problems
Walter Dörwald41980ca2007-08-16 21:55:45 +0000310 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200311 "backslashreplace", "namereplace"]
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000312 # register the handlers under different names,
313 # to prevent the codec from recognizing the name
314 for err in errors:
315 codecs.register_error("test." + err, codecs.lookup_error(err))
316 l = 1000
317 errors += [ "test." + err for err in errors ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000318 for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
Walter Dörwald41980ca2007-08-16 21:55:45 +0000319 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
320 "utf-8", "utf-7", "utf-16", "utf-32"):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000321 for err in errors:
Tim Peters3de75262002-11-09 05:26:15 +0000322 try:
323 uni.encode(enc, err)
324 except UnicodeError:
325 pass
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000326
327 def check_exceptionobjectargs(self, exctype, args, msg):
328 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
329 # check with one missing argument
330 self.assertRaises(TypeError, exctype, *args[:-1])
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000331 # check with one argument too much
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000332 self.assertRaises(TypeError, exctype, *(args + ["too much"]))
333 # check with one argument of the wrong type
Guido van Rossum98297ee2007-11-06 21:34:58 +0000334 wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
Guido van Rossum805365e2007-05-07 22:24:25 +0000335 for i in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000336 for wrongarg in wrongargs:
337 if type(wrongarg) is type(args[i]):
Tim Peters3de75262002-11-09 05:26:15 +0000338 continue
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000339 # build argument array
340 callargs = []
Guido van Rossum805365e2007-05-07 22:24:25 +0000341 for j in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000342 if i==j:
343 callargs.append(wrongarg)
344 else:
345 callargs.append(args[i])
346 self.assertRaises(TypeError, exctype, *callargs)
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000347
348 # check with the correct number and type of arguments
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000349 exc = exctype(*args)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000350 self.assertEqual(str(exc), msg)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000351
352 def test_unicodeencodeerror(self):
353 self.check_exceptionobjectargs(
354 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000355 ["ascii", "g\xfcrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000356 "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000357 )
358 self.check_exceptionobjectargs(
359 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000360 ["ascii", "g\xfcrk", 1, 4, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000361 "'ascii' codec can't encode characters in position 1-3: ouch"
362 )
363 self.check_exceptionobjectargs(
364 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000365 ["ascii", "\xfcx", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000366 "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000367 )
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000368 self.check_exceptionobjectargs(
369 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000370 ["ascii", "\u0100x", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000371 "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000372 )
373 self.check_exceptionobjectargs(
374 UnicodeEncodeError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000375 ["ascii", "\uffffx", 0, 1, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000376 "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000377 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200378 self.check_exceptionobjectargs(
379 UnicodeEncodeError,
380 ["ascii", "\U00010000x", 0, 1, "ouch"],
381 "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
382 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000383
384 def test_unicodedecodeerror(self):
385 self.check_exceptionobjectargs(
386 UnicodeDecodeError,
Guido van Rossum254348e2007-11-21 19:29:53 +0000387 ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000388 "'ascii' codec can't decode byte 0xfc in position 1: ouch"
389 )
390 self.check_exceptionobjectargs(
391 UnicodeDecodeError,
Guido van Rossum254348e2007-11-21 19:29:53 +0000392 ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000393 "'ascii' codec can't decode bytes in position 1-2: ouch"
394 )
395
396 def test_unicodetranslateerror(self):
397 self.check_exceptionobjectargs(
398 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000399 ["g\xfcrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000400 "can't translate character '\\xfc' in position 1: ouch"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000401 )
402 self.check_exceptionobjectargs(
403 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000404 ["g\u0100rk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000405 "can't translate character '\\u0100' in position 1: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000406 )
407 self.check_exceptionobjectargs(
408 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000409 ["g\uffffrk", 1, 2, "ouch"],
Walter Dörwald32a4c712007-06-20 09:25:34 +0000410 "can't translate character '\\uffff' in position 1: ouch"
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000411 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200412 self.check_exceptionobjectargs(
413 UnicodeTranslateError,
414 ["g\U00010000rk", 1, 2, "ouch"],
415 "can't translate character '\\U00010000' in position 1: ouch"
416 )
Walter Dörwaldfd196bd2003-08-12 17:32:43 +0000417 self.check_exceptionobjectargs(
418 UnicodeTranslateError,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000419 ["g\xfcrk", 1, 3, "ouch"],
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000420 "can't translate characters in position 1-2: ouch"
421 )
422
423 def test_badandgoodstrictexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000424 # "strict" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000425 self.assertRaises(
426 TypeError,
427 codecs.strict_errors,
428 42
429 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000430 # "strict" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000431 self.assertRaises(
432 Exception,
433 codecs.strict_errors,
434 Exception("ouch")
435 )
436
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000437 # If the correct exception is passed in, "strict" raises it
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000438 self.assertRaises(
439 UnicodeEncodeError,
440 codecs.strict_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000441 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000442 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200443 self.assertRaises(
444 UnicodeDecodeError,
445 codecs.strict_errors,
446 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
447 )
448 self.assertRaises(
449 UnicodeTranslateError,
450 codecs.strict_errors,
451 UnicodeTranslateError("\u3042", 0, 1, "ouch")
452 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000453
454 def test_badandgoodignoreexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000455 # "ignore" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 self.assertRaises(
457 TypeError,
458 codecs.ignore_errors,
459 42
460 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000461 # "ignore" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000462 self.assertRaises(
463 TypeError,
464 codecs.ignore_errors,
465 UnicodeError("ouch")
466 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000467 # If the correct exception is passed in, "ignore" returns an empty replacement
Ezio Melottib3aedd42010-11-20 19:04:17 +0000468 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000469 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200470 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
471 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000472 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000473 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000474 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200475 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
476 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000477 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000478 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000479 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200480 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
481 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000482 )
483
484 def test_badandgoodreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000485 # "replace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000486 self.assertRaises(
487 TypeError,
488 codecs.replace_errors,
489 42
490 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000491 # "replace" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000492 self.assertRaises(
493 TypeError,
494 codecs.replace_errors,
495 UnicodeError("ouch")
496 )
Walter Dörwald690402f2005-11-17 18:51:34 +0000497 self.assertRaises(
Walter Dörwald690402f2005-11-17 18:51:34 +0000498 TypeError,
499 codecs.replace_errors,
500 BadObjectUnicodeEncodeError()
501 )
502 self.assertRaises(
Walter Dörwald690402f2005-11-17 18:51:34 +0000503 TypeError,
504 codecs.replace_errors,
505 BadObjectUnicodeDecodeError()
506 )
Guido van Rossum805365e2007-05-07 22:24:25 +0000507 # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
Ezio Melottib3aedd42010-11-20 19:04:17 +0000508 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000509 codecs.replace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200510 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
511 ("?", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000512 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000513 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000514 codecs.replace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200515 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
516 ("\ufffd", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000517 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000518 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000519 codecs.replace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200520 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
521 ("\ufffd", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522 )
523
524 def test_badandgoodxmlcharrefreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000525 # "xmlcharrefreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526 self.assertRaises(
527 TypeError,
528 codecs.xmlcharrefreplace_errors,
529 42
530 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000531 # "xmlcharrefreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000532 self.assertRaises(
533 TypeError,
534 codecs.xmlcharrefreplace_errors,
535 UnicodeError("ouch")
536 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000537 # "xmlcharrefreplace" can only be used for encoding
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000538 self.assertRaises(
539 TypeError,
540 codecs.xmlcharrefreplace_errors,
Guido van Rossum254348e2007-11-21 19:29:53 +0000541 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000542 )
543 self.assertRaises(
544 TypeError,
545 codecs.xmlcharrefreplace_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000546 UnicodeTranslateError("\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000547 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000548 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200549 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
550 999999, 1000000)
551 cs += (0xd800, 0xdfff)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000552 s = "".join(chr(c) for c in cs)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000553 self.assertEqual(
Walter Dörwald690402f2005-11-17 18:51:34 +0000554 codecs.xmlcharrefreplace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200555 UnicodeEncodeError("ascii", "a" + s + "b",
556 1, 1 + len(s), "ouch")
Walter Dörwald690402f2005-11-17 18:51:34 +0000557 ),
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200558 ("".join("&#%d;" % c for c in cs), 1 + len(s))
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000559 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000560
561 def test_badandgoodbackslashreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000562 # "backslashreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000563 self.assertRaises(
564 TypeError,
565 codecs.backslashreplace_errors,
566 42
567 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000568 # "backslashreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000569 self.assertRaises(
570 TypeError,
571 codecs.backslashreplace_errors,
572 UnicodeError("ouch")
573 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000574 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200575 tests = [
576 ("\u3042", "\\u3042"),
577 ("\n", "\\x0a"),
578 ("a", "\\x61"),
579 ("\x00", "\\x00"),
580 ("\xff", "\\xff"),
581 ("\u0100", "\\u0100"),
582 ("\uffff", "\\uffff"),
583 ("\U00010000", "\\U00010000"),
584 ("\U0010ffff", "\\U0010ffff"),
585 # Lone surrogates
586 ("\ud800", "\\ud800"),
587 ("\udfff", "\\udfff"),
588 ("\ud800\udfff", "\\ud800\\udfff"),
589 ]
590 for s, r in tests:
591 with self.subTest(str=s):
592 self.assertEqual(
593 codecs.backslashreplace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200594 UnicodeEncodeError("ascii", "a" + s + "b",
595 1, 1 + len(s), "ouch")),
596 (r, 1 + len(s))
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200597 )
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200598 self.assertEqual(
599 codecs.backslashreplace_errors(
Serhiy Storchakab8a78d32015-03-16 08:31:38 +0200600 UnicodeTranslateError("a" + s + "b",
601 1, 1 + len(s), "ouch")),
602 (r, 1 + len(s))
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200603 )
604 tests = [
605 (b"a", "\\x61"),
606 (b"\n", "\\x0a"),
607 (b"\x00", "\\x00"),
608 (b"\xff", "\\xff"),
609 ]
610 for b, r in tests:
611 with self.subTest(bytes=b):
612 self.assertEqual(
613 codecs.backslashreplace_errors(
Serhiy Storchakab8a78d32015-03-16 08:31:38 +0200614 UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
615 1, 2, "ouch")),
616 (r, 2)
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200617 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000618
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200619 def test_badandgoodnamereplaceexceptions(self):
620 # "namereplace" complains about a non-exception passed in
621 self.assertRaises(
622 TypeError,
623 codecs.namereplace_errors,
624 42
625 )
626 # "namereplace" complains about the wrong exception types
627 self.assertRaises(
628 TypeError,
629 codecs.namereplace_errors,
630 UnicodeError("ouch")
631 )
632 # "namereplace" can only be used for encoding
633 self.assertRaises(
634 TypeError,
635 codecs.namereplace_errors,
636 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
637 )
638 self.assertRaises(
639 TypeError,
640 codecs.namereplace_errors,
641 UnicodeTranslateError("\u3042", 0, 1, "ouch")
642 )
643 # Use the correct exception
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200644 tests = [
645 ("\u3042", "\\N{HIRAGANA LETTER A}"),
646 ("\x00", "\\x00"),
647 ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
648 "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
649 ("\U000e007f", "\\N{CANCEL TAG}"),
650 ("\U0010ffff", "\\U0010ffff"),
651 # Lone surrogates
652 ("\ud800", "\\ud800"),
653 ("\udfff", "\\udfff"),
654 ("\ud800\udfff", "\\ud800\\udfff"),
655 ]
656 for s, r in tests:
657 with self.subTest(str=s):
658 self.assertEqual(
659 codecs.namereplace_errors(
Serhiy Storchakab8a78d32015-03-16 08:31:38 +0200660 UnicodeEncodeError("ascii", "a" + s + "b",
661 1, 1 + len(s), "ouch")),
662 (r, 1 + len(s))
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200663 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200664
665 def test_badandgoodsurrogateescapeexceptions(self):
666 surrogateescape_errors = codecs.lookup_error('surrogateescape')
667 # "surrogateescape" complains about a non-exception passed in
668 self.assertRaises(
669 TypeError,
670 surrogateescape_errors,
671 42
672 )
673 # "surrogateescape" complains about the wrong exception types
674 self.assertRaises(
675 TypeError,
676 surrogateescape_errors,
677 UnicodeError("ouch")
678 )
679 # "surrogateescape" can not be used for translating
680 self.assertRaises(
681 TypeError,
682 surrogateescape_errors,
683 UnicodeTranslateError("\udc80", 0, 1, "ouch")
684 )
685 # Use the correct exception
686 for s in ("a", "\udc7f", "\udd00"):
687 with self.subTest(str=s):
688 self.assertRaises(
689 UnicodeEncodeError,
690 surrogateescape_errors,
691 UnicodeEncodeError("ascii", s, 0, 1, "ouch")
692 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200693 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200694 surrogateescape_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200695 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
696 (b"\x80", 2)
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200697 )
698 self.assertRaises(
699 UnicodeDecodeError,
700 surrogateescape_errors,
701 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200702 )
703 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200704 surrogateescape_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200705 UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
706 ("\udc80", 2)
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200707 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200708
709 def test_badandgoodsurrogatepassexceptions(self):
710 surrogatepass_errors = codecs.lookup_error('surrogatepass')
711 # "surrogatepass" complains about a non-exception passed in
712 self.assertRaises(
713 TypeError,
714 surrogatepass_errors,
715 42
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200716 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200717 # "surrogatepass" complains about the wrong exception types
718 self.assertRaises(
719 TypeError,
720 surrogatepass_errors,
721 UnicodeError("ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200722 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200723 # "surrogatepass" can not be used for translating
724 self.assertRaises(
725 TypeError,
726 surrogatepass_errors,
727 UnicodeTranslateError("\ud800", 0, 1, "ouch")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200728 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200729 # Use the correct exception
730 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
731 with self.subTest(encoding=enc):
732 self.assertRaises(
733 UnicodeEncodeError,
734 surrogatepass_errors,
735 UnicodeEncodeError(enc, "a", 0, 1, "ouch")
736 )
737 self.assertRaises(
738 UnicodeDecodeError,
739 surrogatepass_errors,
740 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
741 )
Serhiy Storchaka93f4d4c2015-03-15 23:43:34 +0200742 for s in ("\ud800", "\udfff", "\ud800\udfff"):
743 with self.subTest(str=s):
744 self.assertRaises(
745 UnicodeEncodeError,
746 surrogatepass_errors,
747 UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
748 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200749 tests = [
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200750 ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
751 ("utf-16le", "\ud800", b'\x00\xd8', 2),
752 ("utf-16be", "\ud800", b'\xd8\x00', 2),
753 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
754 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200755 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
756 ("utf-16le", "\udfff", b'\xff\xdf', 2),
757 ("utf-16be", "\udfff", b'\xdf\xff', 2),
758 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
759 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200760 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
761 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
762 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
763 ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
764 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
765 ]
766 for enc, s, b, n in tests:
767 with self.subTest(encoding=enc, str=s, bytes=b):
768 self.assertEqual(
769 surrogatepass_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200770 UnicodeEncodeError(enc, "a" + s + "b",
771 1, 1 + len(s), "ouch")),
772 (b, 1 + len(s))
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200773 )
774 self.assertEqual(
775 surrogatepass_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200776 UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
Serhiy Storchakab8a78d32015-03-16 08:31:38 +0200777 1, 1 + n, "ouch")),
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200778 (s[:1], 1 + n)
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200779 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200780
def test_badhandlerresults(self):
    # An error handler has to return a (replacement, position) tuple.
    # Register handlers returning anything else and check that both
    # encoding and decoding fail with TypeError.
    #
    # Each malformed result is listed once (the original tuple repeated
    # its last three entries).
    results = (
        42,
        "foo",
        (1, 2, 3),
        ("foo", 1, 3),
        ("foo", None),
        ("foo",),
    )
    encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
    # (encoding, undecodable input); "data" rather than "bytes" so the
    # builtin is not shadowed
    undecodable = (
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-7", b"+x-"),
        ("unicode-internal", b"\x00"),
    )

    for res in results:
        # Bind the current result as a default so the handler does not
        # depend on the loop variable's later values
        codecs.register_error("test.badhandler", lambda exc, res=res: res)
        for enc in encs:
            self.assertRaises(
                TypeError,
                "\u3042".encode,
                enc,
                "test.badhandler"
            )
        for enc, data in undecodable:
            with test.support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(
                    TypeError,
                    data.decode,
                    enc,
                    "test.badhandler"
                )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808
809 def test_lookup(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +0000810 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
811 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
812 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
813 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814 codecs.xmlcharrefreplace_errors,
815 codecs.lookup_error("xmlcharrefreplace")
816 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000817 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000818 codecs.backslashreplace_errors,
819 codecs.lookup_error("backslashreplace")
820 )
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200821 self.assertEqual(
822 codecs.namereplace_errors,
823 codecs.lookup_error("namereplace")
824 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000825
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000826 def test_unencodablereplacement(self):
827 def unencrepl(exc):
828 if isinstance(exc, UnicodeEncodeError):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000829 return ("\u4242", exc.end)
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000830 else:
831 raise TypeError("don't know how to handle %r" % exc)
832 codecs.register_error("test.unencreplhandler", unencrepl)
833 for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
834 self.assertRaises(
835 UnicodeEncodeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000836 "\u4242".encode,
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000837 enc,
838 "test.unencreplhandler"
839 )
840
Walter Dörwald30537a42003-01-08 23:22:13 +0000841 def test_badregistercall(self):
842 # enhance coverage of:
843 # Modules/_codecsmodule.c::register_error()
844 # Python/codecs.c::PyCodec_RegisterError()
845 self.assertRaises(TypeError, codecs.register_error, 42)
846 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
847
Walter Dörwalde22d3392005-11-17 08:52:34 +0000848 def test_badlookupcall(self):
849 # enhance coverage of:
850 # Modules/_codecsmodule.c::lookup_error()
851 self.assertRaises(TypeError, codecs.lookup_error)
852
Walter Dörwald30537a42003-01-08 23:22:13 +0000853 def test_unknownhandler(self):
854 # enhance coverage of:
855 # Modules/_codecsmodule.c::lookup_error()
856 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
857
858 def test_xmlcharrefvalues(self):
859 # enhance coverage of:
860 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
861 # and inline implementations
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200862 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
863 500000, 1000000)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000864 s = "".join([chr(x) for x in v])
Walter Dörwald30537a42003-01-08 23:22:13 +0000865 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
866 for enc in ("ascii", "iso-8859-15"):
867 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
868 s.encode(enc, err)
869
def test_decodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
    # and callers

    # An unregistered handler name
    with self.assertRaises(LookupError):
        b"\xff".decode("ascii", "test.unknown")

    # A handler that does not return a tuple
    def baddecodereturn1(exc):
        return 42
    codecs.register_error("test.baddecodereturn1", baddecodereturn1)
    not_a_tuple_cases = (
        (b"\xff", "ascii"),
        (b"\\", "unicode-escape"),
        (b"\\x0", "unicode-escape"),
        (b"\\x0y", "unicode-escape"),
        (b"\\Uffffeeee", "unicode-escape"),
        (b"\\uyyyy", "raw-unicode-escape"),
    )
    for data, enc in not_a_tuple_cases:
        with self.assertRaises(TypeError):
            data.decode(enc, "test.baddecodereturn1")

    # A handler whose position is not an integer
    def baddecodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.baddecodereturn2", baddecodereturn2)
    with self.assertRaises(TypeError):
        b"\xff".decode("ascii", "test.baddecodereturn2")

    handler = PosReturn()
    codecs.register_error("test.posreturn", handler.handle)

    # (resume position returned by the handler, expected result);
    # None stands for an expected IndexError
    position_cases = (
        (-1, "<?>0"),    # valid negative position
        (-2, "<?><?>"),  # valid negative position
        (-3, None),      # negative position out of bounds
        (1, "<?>0"),     # valid positive position
        (2, "<?>"),      # largest valid positive position
                         # (one beyond end of input)
        (3, None),       # invalid positive position
    )
    for pos, expected in position_cases:
        handler.pos = pos
        if expected is None:
            with self.assertRaises(IndexError):
                b"\xff0".decode("ascii", "test.posreturn")
        else:
            self.assertEqual(
                b"\xff0".decode("ascii", "test.posreturn"), expected)

    # Restart at the "0"
    handler.pos = 6
    self.assertEqual(
        b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")

    # A mapping whose every lookup fails with ValueError
    class D(dict):
        def __getitem__(self, key):
            raise ValueError
    with self.assertRaises(UnicodeError):
        codecs.charmap_decode(b"\xff", "strict", {0xff: None})
    with self.assertRaises(ValueError):
        codecs.charmap_decode(b"\xff", "strict", D())
    with self.assertRaises(TypeError):
        codecs.charmap_decode(b"\xff", "strict", {0xff: sys.maxunicode+1})
Walter Dörwald30537a42003-01-08 23:22:13 +0000928
def test_encodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
    # and callers

    # An unregistered handler name
    with self.assertRaises(LookupError):
        "\xff".encode("ascii", "test.unknown")

    # A handler that does not return a tuple
    def badencodereturn1(exc):
        return 42
    codecs.register_error("test.badencodereturn1", badencodereturn1)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn1")

    # A handler whose position is not an integer
    def badencodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.badencodereturn2", badencodereturn2)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn2")

    handler = PosReturn()
    codecs.register_error("test.posreturn", handler.handle)

    # (resume position returned by the handler, expected result);
    # None stands for an expected IndexError
    position_cases = (
        (-1, b"<?>0"),    # valid negative position
        (-2, b"<?><?>"),  # valid negative position
        (-3, None),       # negative position out of bounds
        (1, b"<?>0"),     # valid positive position
        (2, b"<?>"),      # largest valid positive position
                          # (one beyond end of input)
        (3, None),        # invalid positive position
    )
    for pos, expected in position_cases:
        handler.pos = pos
        if expected is None:
            with self.assertRaises(IndexError):
                "\xff0".encode("ascii", "test.posreturn")
        else:
            self.assertEqual(
                "\xff0".encode("ascii", "test.posreturn"), expected)

    handler.pos = 0

    # A mapping whose every lookup fails with ValueError
    class D(dict):
        def __getitem__(self, key):
            raise ValueError
    error_handlers = ("strict", "replace", "xmlcharrefreplace",
                      "backslashreplace", "namereplace", "test.posreturn")
    for err in error_handlers:
        with self.assertRaises(UnicodeError):
            codecs.charmap_encode("\xff", err, {0xff: None})
        with self.assertRaises(ValueError):
            codecs.charmap_encode("\xff", err, D())
        with self.assertRaises(TypeError):
            codecs.charmap_encode("\xff", err, {0xff: 300})
Walter Dörwald30537a42003-01-08 23:22:13 +0000982
983 def test_translatehelper(self):
984 # enhance coverage of:
985 # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
986 # and callers
987 # (Unfortunately the errors argument is not directly accessible
988 # from Python, so we can't test that much)
989 class D(dict):
990 def __getitem__(self, key):
991 raise ValueError
Georg Brandledbcc132007-10-24 21:25:34 +0000992 #self.assertRaises(ValueError, "\xff".translate, D())
Victor Stinnere49a95f2014-04-05 15:35:01 +0200993 self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000994 self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
Walter Dörwald30537a42003-01-08 23:22:13 +0000995
Walter Dörwald4894c302003-10-24 14:25:28 +0000996 def test_bug828737(self):
997 charmap = {
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000998 ord("&"): "&amp;",
999 ord("<"): "&lt;",
1000 ord(">"): "&gt;",
1001 ord('"'): "&quot;",
Walter Dörwald4894c302003-10-24 14:25:28 +00001002 }
Tim Peters58eb11c2004-01-18 20:29:55 +00001003
Walter Dörwald4894c302003-10-24 14:25:28 +00001004 for n in (1, 10, 100, 1000):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001005 text = 'abc<def>ghi'*n
Walter Dörwald4894c302003-10-24 14:25:28 +00001006 text.translate(charmap)
1007
def test_mutatingdecodehandler(self):
    # Decode error handlers that tamper with the exception's "object"
    # attribute: for every codec below, decoding has to end with
    # TypeError.
    baddata = [
        ("ascii", b"\xff"),
        ("utf-7", b"++"),
        ("utf-8", b"\xff"),
        ("utf-16", b"\xff"),
        ("utf-32", b"\xff"),
        ("unicode-escape", b"\\u123g"),
        ("raw-unicode-escape", b"\\u123g"),
        ("unicode-internal", b"\xff"),
    ]

    def replacing(exc):
        # Replace the input object with something that is not bytes
        if isinstance(exc, UnicodeDecodeError):
            exc.object = 42
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.replacing", replacing)

    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            with self.assertRaises(TypeError):
                data.decode(encoding, "test.replacing")

    def mutating(exc):
        # Try to empty the input object in place.
        # NOTE(review): slice assignment needs a mutable buffer; if
        # exc.object is an immutable bytes object this statement itself
        # raises the TypeError expected below -- confirm that is intended.
        if isinstance(exc, UnicodeDecodeError):
            exc.object[:] = b""
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.mutating", mutating)
    # If the decoder doesn't pick up the modified input the following
    # will lead to an endless loop
    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            with self.assertRaises(TypeError):
                # Use the handler registered just above; this loop used
                # to run "test.replacing" a second time, leaving
                # "test.mutating" untested
                data.decode(encoding, "test.mutating")
Walter Dörwalde78178e2007-07-30 13:31:40 +00001048
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001049 def test_fake_error_class(self):
1050 handlers = [
1051 codecs.strict_errors,
1052 codecs.ignore_errors,
1053 codecs.replace_errors,
1054 codecs.backslashreplace_errors,
Serhiy Storchakac0937f72015-05-18 16:10:40 +03001055 codecs.namereplace_errors,
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001056 codecs.xmlcharrefreplace_errors,
1057 codecs.lookup_error('surrogateescape'),
1058 codecs.lookup_error('surrogatepass'),
1059 ]
1060 for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
1061 class FakeUnicodeError(str):
1062 __class__ = cls
1063 for handler in handlers:
1064 with self.subTest(handler=handler, error_class=cls):
1065 self.assertRaises(TypeError, handler, FakeUnicodeError())
1066 class FakeUnicodeError(Exception):
1067 __class__ = cls
1068 for handler in handlers:
1069 with self.subTest(handler=handler, error_class=cls):
1070 with self.assertRaises((TypeError, FakeUnicodeError)):
1071 handler(FakeUnicodeError())
1072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001073
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()