blob: cacdfae7d291d084a6c4a7f687566dd68a19c916 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import codecs
2import html.entities
3import sys
4import test.support
5import unicodedata
6import unittest
7import warnings
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008
class PosReturn:
    """Configurable error callback whose resume position is set via ``pos``."""

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return the replacement ``"<?>"`` together with the current ``pos``.

        A negative ``pos`` is interpreted relative to the end of
        ``exc.object``.  When the resulting position does not lie beyond
        ``exc.start``, ``pos`` is moved to the end of the input so that the
        next call terminates instead of looping forever.
        """
        requested = self.pos
        if requested < 0:
            absolute = requested + len(exc.object)
        else:
            absolute = requested
        no_progress = not (absolute > exc.start)
        if no_progress:
            self.pos = len(exc.object)
        return ("<?>", requested)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +000025
Walter Dörwald690402f2005-11-17 18:51:34 +000026# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``start`` is overwritten with a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately store a non-integer in the position attribute.
        self.start = []
31
Walter Dörwald690402f2005-11-17 18:51:34 +000032# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``object`` is overwritten with a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately replace the input string with a non-string.
        self.object = []
37
38# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError that deletes its ``end`` attribute after construction."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        del self.end
43
44# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose ``object`` is overwritten with a list."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately replace the input bytes with a non-bytes object.
        self.object = []
49
50# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError that deletes its ``start`` attribute after construction."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.start
55
56# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError that deletes its ``end`` attribute after construction."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.end
61
62# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError that deletes its ``object`` attribute after construction."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.object
67
Walter Dörwald3aeb6322002-09-02 13:14:32 +000068class CodecCallbackTest(unittest.TestCase):
69
70 def test_xmlcharrefreplace(self):
71 # replace unencodable characters which numeric character entities.
72 # For ascii, latin-1 and charmaps this is completely implemented
73 # in C and should be reasonably fast.
Guido van Rossumef87d6e2007-05-02 19:09:54 +000074 s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000075 self.assertEqual(
76 s.encode("ascii", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000077 b"&#12473;&#12497;&#12514; &#228;nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000078 )
79 self.assertEqual(
80 s.encode("latin-1", "xmlcharrefreplace"),
Walter Dörwald00048f02007-05-09 10:44:06 +000081 b"&#12473;&#12497;&#12514; \xe4nd eggs"
Walter Dörwald3aeb6322002-09-02 13:14:32 +000082 )
83
84 def test_xmlcharnamereplace(self):
85 # This time use a named character entity for unencodable
86 # characters, if one is available.
Walter Dörwald3aeb6322002-09-02 13:14:32 +000087
88 def xmlcharnamereplace(exc):
89 if not isinstance(exc, UnicodeEncodeError):
90 raise TypeError("don't know how to handle %r" % exc)
91 l = []
92 for c in exc.object[exc.start:exc.end]:
93 try:
Fred Drake3c50ea42008-05-17 22:02:32 +000094 l.append("&%s;" % html.entities.codepoint2name[ord(c)])
Walter Dörwald3aeb6322002-09-02 13:14:32 +000095 except KeyError:
Guido van Rossumef87d6e2007-05-02 19:09:54 +000096 l.append("&#%d;" % ord(c))
97 return ("".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +000098
99 codecs.register_error(
100 "test.xmlcharnamereplace", xmlcharnamereplace)
101
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000102 sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
Walter Dörwald00048f02007-05-09 10:44:06 +0000103 sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000104 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000105 sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000106 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
Walter Dörwald00048f02007-05-09 10:44:06 +0000107 sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000108 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
109
110 def test_uninamereplace(self):
111 # We're using the names from the unicode database this time,
Walter Dörwald00445d22002-11-25 17:58:02 +0000112 # and we're doing "syntax highlighting" here, i.e. we include
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000113 # the replaced text in ANSI escape sequences. For this it is
114 # useful that the error handler is not called for every single
115 # unencodable character, but for a complete sequence of
116 # unencodable characters, otherwise we would output many
Mark Dickinson934896d2009-02-21 20:59:32 +0000117 # unnecessary escape sequences.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000118
119 def uninamereplace(exc):
120 if not isinstance(exc, UnicodeEncodeError):
121 raise TypeError("don't know how to handle %r" % exc)
122 l = []
123 for c in exc.object[exc.start:exc.end]:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000124 l.append(unicodedata.name(c, "0x%x" % ord(c)))
125 return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000126
127 codecs.register_error(
128 "test.uninamereplace", uninamereplace)
129
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000130 sin = "\xac\u1234\u20ac\u8000"
Walter Dörwald00048f02007-05-09 10:44:06 +0000131 sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000132 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
133
Walter Dörwald00048f02007-05-09 10:44:06 +0000134 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000135 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
136
Walter Dörwald00048f02007-05-09 10:44:06 +0000137 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000138 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
139
140 def test_backslashescape(self):
141 # Does the same as the "unicode-escape" encoding, but with different
142 # base encodings.
Ezio Melottia9860ae2011-10-04 19:06:00 +0300143 sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
144 sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000145 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
146
Ezio Melottia9860ae2011-10-04 19:06:00 +0300147 sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000148 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
149
Ezio Melottia9860ae2011-10-04 19:06:00 +0300150 sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000151 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
152
Ezio Melotti57221d02010-07-01 07:32:02 +0000153 def test_decoding_callbacks(self):
154 # This is a test for a decoding callback handler
155 # that allows the decoding of the invalid sequence
156 # "\xc0\x80" and returns "\x00" instead of raising an error.
157 # All other illegal sequences will be handled strictly.
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000158 def relaxedutf8(exc):
159 if not isinstance(exc, UnicodeDecodeError):
160 raise TypeError("don't know how to handle %r" % exc)
Ezio Melotti57221d02010-07-01 07:32:02 +0000161 if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000162 return ("\x00", exc.start+2) # retry after two bytes
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000163 else:
164 raise exc
165
Ezio Melotti57221d02010-07-01 07:32:02 +0000166 codecs.register_error("test.relaxedutf8", relaxedutf8)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000167
Ezio Melotti57221d02010-07-01 07:32:02 +0000168 # all the "\xc0\x80" will be decoded to "\x00"
Walter Dörwald00048f02007-05-09 10:44:06 +0000169 sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000170 sout = "a\x00b\x00c\xfc\x00\x00"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000171 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
Ezio Melotti57221d02010-07-01 07:32:02 +0000172
173 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
Walter Dörwald00048f02007-05-09 10:44:06 +0000174 sin = b"\xc0\x80\xc0\x81"
Ezio Melotti57221d02010-07-01 07:32:02 +0000175 self.assertRaises(UnicodeDecodeError, sin.decode,
176 "utf-8", "test.relaxedutf8")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000177
178 def test_charmapencode(self):
179 # For charmap encodings the replacement string will be
180 # mapped through the encoding again. This means, that
181 # to be able to use e.g. the "replace" handler, the
182 # charmap has to have a mapping for "?".
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000184 sin = "abc"
Walter Dörwald00048f02007-05-09 10:44:06 +0000185 sout = b"AABBCC"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000186 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000187
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000188 sin = "abcA"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000189 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
190
Guido van Rossum98297ee2007-11-06 21:34:58 +0000191 charmap[ord("?")] = b"XYZ"
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000192 sin = "abcDEF"
Walter Dörwald00048f02007-05-09 10:44:06 +0000193 sout = b"AABBCCXYZXYZXYZ"
Ezio Melottib3aedd42010-11-20 19:04:17 +0000194 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000195
Walter Dörwald00048f02007-05-09 10:44:06 +0000196 charmap[ord("?")] = "XYZ" # wrong type in mapping
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000197 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
198
def test_decodeunicodeinternal(self):
    # Five bytes cannot be a whole number of "unicode-internal" code
    # units, so decoding them must invoke the error handler.
    data = b"\x00\x00\x00\x00\x00"
    deprecation = ('unicode_internal codec has been '
                   'deprecated', DeprecationWarning)
    with test.support.check_warnings(deprecation):
        with self.assertRaises(UnicodeDecodeError):
            data.decode("unicode-internal")
        # The remaining expectations apply only when one code unit is
        # encoded as four bytes.
        if len('\0'.encode('unicode-internal')) != 4:
            return

        def handler_unicodeinternal(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            return ("\x01", 1)

        self.assertEqual(data.decode("unicode-internal", "ignore"), "\u0000")
        self.assertEqual(
            data.decode("unicode-internal", "replace"), "\u0000\ufffd")

        codecs.register_error("test.hui", handler_unicodeinternal)

        self.assertEqual(
            data.decode("unicode-internal", "test.hui"),
            "\u0000\u0001\u0000",
        )
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000229
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000230 def test_callbacks(self):
231 def handler1(exc):
Walter Dörwald00048f02007-05-09 10:44:06 +0000232 r = range(exc.start, exc.end)
233 if isinstance(exc, UnicodeEncodeError):
234 l = ["<%d>" % ord(exc.object[pos]) for pos in r]
235 elif isinstance(exc, UnicodeDecodeError):
236 l = ["<%d>" % exc.object[pos] for pos in r]
237 else:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000238 raise TypeError("don't know how to handle %r" % exc)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000239 return ("[%s]" % "".join(l), exc.end)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000240
241 codecs.register_error("test.handler1", handler1)
242
243 def handler2(exc):
244 if not isinstance(exc, UnicodeDecodeError):
245 raise TypeError("don't know how to handle %r" % exc)
Walter Dörwald00048f02007-05-09 10:44:06 +0000246 l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000247 return ("[%s]" % "".join(l), exc.end+1) # skip one character
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000248
249 codecs.register_error("test.handler2", handler2)
250
Walter Dörwald00048f02007-05-09 10:44:06 +0000251 s = b"\x00\x81\x7f\x80\xff"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000252
253 self.assertEqual(
254 s.decode("ascii", "test.handler1"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000255 "\x00[<129>]\x7f[<128>][<255>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000256 )
257 self.assertEqual(
258 s.decode("ascii", "test.handler2"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000259 "\x00[<129>][<128>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000260 )
261
262 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000263 b"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200264 "\u3042[<92><117><51>]xxx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000265 )
266
267 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000268 b"\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
Serhiy Storchakad6793772013-01-29 10:20:44 +0200269 "\u3042[<92><117><51>]xx"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000270 )
271
272 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000273 codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000274 "z[<98>][<99>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000275 )
276
277 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000278 "g\xfc\xdfrk".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000279 b"g[<252><223>]rk"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000280 )
281
282 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000283 "g\xfc\xdf".encode("ascii", "test.handler1"),
Walter Dörwald00048f02007-05-09 10:44:06 +0000284 b"g[<252><223>]"
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000285 )
286
287 def test_longstrings(self):
288 # test long strings to check for memory overflow problems
Walter Dörwald41980ca2007-08-16 21:55:45 +0000289 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
290 "backslashreplace"]
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000291 # register the handlers under different names,
292 # to prevent the codec from recognizing the name
293 for err in errors:
294 codecs.register_error("test." + err, codecs.lookup_error(err))
295 l = 1000
296 errors += [ "test." + err for err in errors ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000297 for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
Walter Dörwald41980ca2007-08-16 21:55:45 +0000298 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
299 "utf-8", "utf-7", "utf-16", "utf-32"):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000300 for err in errors:
Tim Peters3de75262002-11-09 05:26:15 +0000301 try:
302 uni.encode(enc, err)
303 except UnicodeError:
304 pass
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000305
306 def check_exceptionobjectargs(self, exctype, args, msg):
307 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
308 # check with one missing argument
309 self.assertRaises(TypeError, exctype, *args[:-1])
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000310 # check with one argument too much
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000311 self.assertRaises(TypeError, exctype, *(args + ["too much"]))
312 # check with one argument of the wrong type
Guido van Rossum98297ee2007-11-06 21:34:58 +0000313 wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
Guido van Rossum805365e2007-05-07 22:24:25 +0000314 for i in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000315 for wrongarg in wrongargs:
316 if type(wrongarg) is type(args[i]):
Tim Peters3de75262002-11-09 05:26:15 +0000317 continue
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000318 # build argument array
319 callargs = []
Guido van Rossum805365e2007-05-07 22:24:25 +0000320 for j in range(len(args)):
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000321 if i==j:
322 callargs.append(wrongarg)
323 else:
324 callargs.append(args[i])
325 self.assertRaises(TypeError, exctype, *callargs)
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000326
327 # check with the correct number and type of arguments
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000328 exc = exctype(*args)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000329 self.assertEqual(str(exc), msg)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000330
def test_unicodeencodeerror(self):
    # Each entry pairs constructor arguments with the expected str().
    cases = (
        (["ascii", "g\xfcrk", 1, 2, "ouch"],
         "'ascii' codec can't encode character '\\xfc' in position 1: ouch"),
        (["ascii", "g\xfcrk", 1, 4, "ouch"],
         "'ascii' codec can't encode characters in position 1-3: ouch"),
        (["ascii", "\xfcx", 0, 1, "ouch"],
         "'ascii' codec can't encode character '\\xfc' in position 0: ouch"),
        (["ascii", "\u0100x", 0, 1, "ouch"],
         "'ascii' codec can't encode character '\\u0100' in position 0: ouch"),
        (["ascii", "\uffffx", 0, 1, "ouch"],
         "'ascii' codec can't encode character '\\uffff' in position 0: ouch"),
        (["ascii", "\U00010000x", 0, 1, "ouch"],
         "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"),
    )
    for ctor_args, expected in cases:
        self.check_exceptionobjectargs(UnicodeEncodeError, ctor_args, expected)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000362
def test_unicodedecodeerror(self):
    # Each entry pairs constructor arguments with the expected str().
    cases = (
        (["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
         "'ascii' codec can't decode byte 0xfc in position 1: ouch"),
        (["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
         "'ascii' codec can't decode bytes in position 1-2: ouch"),
    )
    for ctor_args, expected in cases:
        self.check_exceptionobjectargs(UnicodeDecodeError, ctor_args, expected)
374
def test_unicodetranslateerror(self):
    # Each entry pairs constructor arguments with the expected str().
    cases = (
        (["g\xfcrk", 1, 2, "ouch"],
         "can't translate character '\\xfc' in position 1: ouch"),
        (["g\u0100rk", 1, 2, "ouch"],
         "can't translate character '\\u0100' in position 1: ouch"),
        (["g\uffffrk", 1, 2, "ouch"],
         "can't translate character '\\uffff' in position 1: ouch"),
        (["g\U00010000rk", 1, 2, "ouch"],
         "can't translate character '\\U00010000' in position 1: ouch"),
        (["g\xfcrk", 1, 3, "ouch"],
         "can't translate characters in position 1-2: ouch"),
    )
    for ctor_args, expected in cases:
        self.check_exceptionobjectargs(
            UnicodeTranslateError, ctor_args, expected)
401
402 def test_badandgoodstrictexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000403 # "strict" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000404 self.assertRaises(
405 TypeError,
406 codecs.strict_errors,
407 42
408 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000409 # "strict" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000410 self.assertRaises(
411 Exception,
412 codecs.strict_errors,
413 Exception("ouch")
414 )
415
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000416 # If the correct exception is passed in, "strict" raises it
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000417 self.assertRaises(
418 UnicodeEncodeError,
419 codecs.strict_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000420 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000421 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200422 self.assertRaises(
423 UnicodeDecodeError,
424 codecs.strict_errors,
425 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
426 )
427 self.assertRaises(
428 UnicodeTranslateError,
429 codecs.strict_errors,
430 UnicodeTranslateError("\u3042", 0, 1, "ouch")
431 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000432
433 def test_badandgoodignoreexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000434 # "ignore" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000435 self.assertRaises(
436 TypeError,
437 codecs.ignore_errors,
438 42
439 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000440 # "ignore" complains about the wrong exception type
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000441 self.assertRaises(
442 TypeError,
443 codecs.ignore_errors,
444 UnicodeError("ouch")
445 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000446 # If the correct exception is passed in, "ignore" returns an empty replacement
Ezio Melottib3aedd42010-11-20 19:04:17 +0000447 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000448 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200449 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
450 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000451 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000452 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000453 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200454 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
455 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000457 self.assertEqual(
Walter Dörwald00048f02007-05-09 10:44:06 +0000458 codecs.ignore_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200459 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
460 ("", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000461 )
462
def test_badandgoodreplaceexceptions(self):
    handler = codecs.replace_errors
    rejected_arguments = (
        # "replace" complains about a non-exception passed in
        42,
        # "replace" complains about the wrong exception type
        UnicodeError("ouch"),
        # ... and about exceptions whose object attribute is not usable
        BadObjectUnicodeEncodeError(),
        BadObjectUnicodeDecodeError(),
    )
    for rejected in rejected_arguments:
        self.assertRaises(TypeError, handler, rejected)
    # With the correct exception, "replace" returns an "?" or "\ufffd"
    # replacement
    accepted = (
        (UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch"),
         ("?", 2)),
        (UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch"),
         ("\ufffd", 2)),
        (UnicodeTranslateError("a\u3042b", 1, 2, "ouch"),
         ("\ufffd", 2)),
    )
    for exc, expected in accepted:
        self.assertEqual(handler(exc), expected)
502
503 def test_badandgoodxmlcharrefreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000504 # "xmlcharrefreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000505 self.assertRaises(
506 TypeError,
507 codecs.xmlcharrefreplace_errors,
508 42
509 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000510 # "xmlcharrefreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000511 self.assertRaises(
512 TypeError,
513 codecs.xmlcharrefreplace_errors,
514 UnicodeError("ouch")
515 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000516 # "xmlcharrefreplace" can only be used for encoding
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000517 self.assertRaises(
518 TypeError,
519 codecs.xmlcharrefreplace_errors,
Guido van Rossum254348e2007-11-21 19:29:53 +0000520 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000521 )
522 self.assertRaises(
523 TypeError,
524 codecs.xmlcharrefreplace_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525 UnicodeTranslateError("\u3042", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000527 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200528 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
529 999999, 1000000)
530 cs += (0xd800, 0xdfff)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000531 s = "".join(chr(c) for c in cs)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000532 self.assertEqual(
Walter Dörwald690402f2005-11-17 18:51:34 +0000533 codecs.xmlcharrefreplace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200534 UnicodeEncodeError("ascii", "a" + s + "b",
535 1, 1 + len(s), "ouch")
Walter Dörwald690402f2005-11-17 18:51:34 +0000536 ),
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200537 ("".join("&#%d;" % c for c in cs), 1 + len(s))
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000538 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000539
540 def test_badandgoodbackslashreplaceexceptions(self):
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000541 # "backslashreplace" complains about a non-exception passed in
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000542 self.assertRaises(
543 TypeError,
544 codecs.backslashreplace_errors,
545 42
546 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000547 # "backslashreplace" complains about the wrong exception types
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000548 self.assertRaises(
549 TypeError,
550 codecs.backslashreplace_errors,
551 UnicodeError("ouch")
552 )
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000553 # "backslashreplace" can only be used for encoding
554 self.assertRaises(
555 TypeError,
556 codecs.backslashreplace_errors,
Guido van Rossum254348e2007-11-21 19:29:53 +0000557 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000558 )
559 self.assertRaises(
560 TypeError,
561 codecs.backslashreplace_errors,
Walter Dörwaldd2034312007-05-18 16:29:38 +0000562 UnicodeTranslateError("\u3042", 0, 1, "ouch")
Walter Dörwaldea4250d2003-01-20 02:34:07 +0000563 )
564 # Use the correct exception
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200565 tests = [
566 ("\u3042", "\\u3042"),
567 ("\n", "\\x0a"),
568 ("a", "\\x61"),
569 ("\x00", "\\x00"),
570 ("\xff", "\\xff"),
571 ("\u0100", "\\u0100"),
572 ("\uffff", "\\uffff"),
573 ("\U00010000", "\\U00010000"),
574 ("\U0010ffff", "\\U0010ffff"),
575 # Lone surrogates
576 ("\ud800", "\\ud800"),
577 ("\udfff", "\\udfff"),
578 ("\ud800\udfff", "\\ud800\\udfff"),
579 ]
580 for s, r in tests:
581 with self.subTest(str=s):
582 self.assertEqual(
583 codecs.backslashreplace_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200584 UnicodeEncodeError("ascii", "a" + s + "b",
585 1, 1 + len(s), "ouch")),
586 (r, 1 + len(s))
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200587 )
588
589 def test_badandgoodsurrogateescapeexceptions(self):
590 surrogateescape_errors = codecs.lookup_error('surrogateescape')
591 # "surrogateescape" complains about a non-exception passed in
592 self.assertRaises(
593 TypeError,
594 surrogateescape_errors,
595 42
596 )
597 # "surrogateescape" complains about the wrong exception types
598 self.assertRaises(
599 TypeError,
600 surrogateescape_errors,
601 UnicodeError("ouch")
602 )
603 # "surrogateescape" can not be used for translating
604 self.assertRaises(
605 TypeError,
606 surrogateescape_errors,
607 UnicodeTranslateError("\udc80", 0, 1, "ouch")
608 )
609 # Use the correct exception
610 for s in ("a", "\udc7f", "\udd00"):
611 with self.subTest(str=s):
612 self.assertRaises(
613 UnicodeEncodeError,
614 surrogateescape_errors,
615 UnicodeEncodeError("ascii", s, 0, 1, "ouch")
616 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000617 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200618 surrogateescape_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200619 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
620 (b"\x80", 2)
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200621 )
622 self.assertRaises(
623 UnicodeDecodeError,
624 surrogateescape_errors,
625 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000627 self.assertEqual(
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200628 surrogateescape_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200629 UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
630 ("\udc80", 2)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000631 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200632
633 def test_badandgoodsurrogatepassexceptions(self):
634 surrogatepass_errors = codecs.lookup_error('surrogatepass')
635 # "surrogatepass" complains about a non-exception passed in
636 self.assertRaises(
637 TypeError,
638 surrogatepass_errors,
639 42
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000640 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200641 # "surrogatepass" complains about the wrong exception types
642 self.assertRaises(
643 TypeError,
644 surrogatepass_errors,
645 UnicodeError("ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000646 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200647 # "surrogatepass" can not be used for translating
648 self.assertRaises(
649 TypeError,
650 surrogatepass_errors,
651 UnicodeTranslateError("\ud800", 0, 1, "ouch")
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 )
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200653 # Use the correct exception
654 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
655 with self.subTest(encoding=enc):
656 self.assertRaises(
657 UnicodeEncodeError,
658 surrogatepass_errors,
659 UnicodeEncodeError(enc, "a", 0, 1, "ouch")
660 )
661 self.assertRaises(
662 UnicodeDecodeError,
663 surrogatepass_errors,
664 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
665 )
666 tests = [
667 ("ascii", "\ud800", b'\xed\xa0\x80', 3),
668 ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
669 ("utf-16le", "\ud800", b'\x00\xd8', 2),
670 ("utf-16be", "\ud800", b'\xd8\x00', 2),
671 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
672 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
673 ("ascii", "\udfff", b'\xed\xbf\xbf', 3),
674 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
675 ("utf-16le", "\udfff", b'\xff\xdf', 2),
676 ("utf-16be", "\udfff", b'\xdf\xff', 2),
677 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
678 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
679 ("ascii", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
680 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
681 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
682 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
683 ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
684 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
685 ]
686 for enc, s, b, n in tests:
687 with self.subTest(encoding=enc, str=s, bytes=b):
688 self.assertEqual(
689 surrogatepass_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200690 UnicodeEncodeError(enc, "a" + s + "b",
691 1, 1 + len(s), "ouch")),
692 (b, 1 + len(s))
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200693 )
694 self.assertEqual(
695 surrogatepass_errors(
Serhiy Storchaka05d54732015-03-16 08:29:47 +0200696 UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
697 1, n, "ouch")),
698 (s[:1], 1 + n)
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200699 )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700
def test_badhandlerresults(self):
    # Every value in `results` is installed in turn as the return value of
    # the "test.badhandler" error handler; each one must make encoding and
    # decoding fail with TypeError.
    results = (42, "foo", (1, 2, 3), ("foo", 1, 3), ("foo", None), ("foo",))
    encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
    # (encoding, input that triggers a decoding error) pairs
    undecodable = (
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-7", b"+x-"),
        ("unicode-internal", b"\x00"),
    )

    for res in results:
        # Bind the current value as a default argument so the handler does
        # not depend on the loop variable.
        codecs.register_error("test.badhandler", lambda exc, res=res: res)
        for enc in encs:
            self.assertRaises(
                TypeError,
                "\u3042".encode,
                enc,
                "test.badhandler"
            )
        for (enc, data) in undecodable:
            with test.support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(
                    TypeError,
                    data.decode,
                    enc,
                    "test.badhandler"
                )
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000728
729 def test_lookup(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +0000730 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
731 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
732 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
733 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000734 codecs.xmlcharrefreplace_errors,
735 codecs.lookup_error("xmlcharrefreplace")
736 )
Ezio Melottib3aedd42010-11-20 19:04:17 +0000737 self.assertEqual(
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000738 codecs.backslashreplace_errors,
739 codecs.lookup_error("backslashreplace")
740 )
741
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000742 def test_unencodablereplacement(self):
743 def unencrepl(exc):
744 if isinstance(exc, UnicodeEncodeError):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000745 return ("\u4242", exc.end)
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000746 else:
747 raise TypeError("don't know how to handle %r" % exc)
748 codecs.register_error("test.unencreplhandler", unencrepl)
749 for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
750 self.assertRaises(
751 UnicodeEncodeError,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000752 "\u4242".encode,
Walter Dörwald9ab7dd42002-09-06 17:21:40 +0000753 enc,
754 "test.unencreplhandler"
755 )
756
Walter Dörwald30537a42003-01-08 23:22:13 +0000757 def test_badregistercall(self):
758 # enhance coverage of:
759 # Modules/_codecsmodule.c::register_error()
760 # Python/codecs.c::PyCodec_RegisterError()
761 self.assertRaises(TypeError, codecs.register_error, 42)
762 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
763
Walter Dörwalde22d3392005-11-17 08:52:34 +0000764 def test_badlookupcall(self):
765 # enhance coverage of:
766 # Modules/_codecsmodule.c::lookup_error()
767 self.assertRaises(TypeError, codecs.lookup_error)
768
Walter Dörwald30537a42003-01-08 23:22:13 +0000769 def test_unknownhandler(self):
770 # enhance coverage of:
771 # Modules/_codecsmodule.c::lookup_error()
772 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
773
774 def test_xmlcharrefvalues(self):
775 # enhance coverage of:
776 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
777 # and inline implementations
Serhiy Storchaka98d156b2015-03-15 23:41:37 +0200778 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
779 500000, 1000000)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000780 s = "".join([chr(x) for x in v])
Walter Dörwald30537a42003-01-08 23:22:13 +0000781 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
782 for enc in ("ascii", "iso-8859-15"):
783 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
784 s.encode(enc, err)
785
def test_decodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
    # and callers
    with self.assertRaises(LookupError):
        b"\xff".decode("ascii", "test.unknown")

    # Handler whose result is not a tuple
    def baddecodereturn1(exc):
        return 42
    codecs.register_error("test.baddecodereturn1", baddecodereturn1)
    undecodable = (
        (b"\xff", "ascii"),
        (b"\\", "unicode-escape"),
        (b"\\x0", "unicode-escape"),
        (b"\\x0y", "unicode-escape"),
        (b"\\Uffffeeee", "unicode-escape"),
        (b"\\uyyyy", "raw-unicode-escape"),
    )
    for raw, codec_name in undecodable:
        with self.assertRaises(TypeError):
            raw.decode(codec_name, "test.baddecodereturn1")

    # Handler whose restart position is None
    def baddecodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.baddecodereturn2", baddecodereturn2)
    with self.assertRaises(TypeError):
        b"\xff".decode("ascii", "test.baddecodereturn2")

    # PosReturn.handle replies with "<?>" and whatever restart position
    # is currently stored in its .pos attribute.
    posreturn = PosReturn()
    codecs.register_error("test.posreturn", posreturn.handle)
    sample = b"\xff0"

    # Valid negative position
    posreturn.pos = -1
    self.assertEqual(sample.decode("ascii", "test.posreturn"), "<?>0")

    # Valid negative position
    posreturn.pos = -2
    self.assertEqual(sample.decode("ascii", "test.posreturn"), "<?><?>")

    # Negative position out of bounds
    posreturn.pos = -3
    with self.assertRaises(IndexError):
        sample.decode("ascii", "test.posreturn")

    # Valid positive position
    posreturn.pos = 1
    self.assertEqual(sample.decode("ascii", "test.posreturn"), "<?>0")

    # Largest valid positive position (one beyond end of input)
    posreturn.pos = 2
    self.assertEqual(sample.decode("ascii", "test.posreturn"), "<?>")

    # Invalid positive position
    posreturn.pos = 3
    with self.assertRaises(IndexError):
        sample.decode("ascii", "test.posreturn")

    # Restart at the "0"
    posreturn.pos = 6
    self.assertEqual(
        b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"),
        "<?>0"
    )

    # A mapping whose every lookup fails with ValueError
    class RaisingMap(dict):
        def __getitem__(self, key):
            raise ValueError
    with self.assertRaises(UnicodeError):
        codecs.charmap_decode(b"\xff", "strict", {0xff: None})
    with self.assertRaises(ValueError):
        codecs.charmap_decode(b"\xff", "strict", RaisingMap())
    with self.assertRaises(TypeError):
        codecs.charmap_decode(b"\xff", "strict", {0xff: sys.maxunicode+1})
Walter Dörwald30537a42003-01-08 23:22:13 +0000844
def test_encodehelper(self):
    # enhance coverage of:
    # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
    # and callers
    with self.assertRaises(LookupError):
        "\xff".encode("ascii", "test.unknown")

    # Handler whose result is not a tuple
    def badencodereturn1(exc):
        return 42
    codecs.register_error("test.badencodereturn1", badencodereturn1)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn1")

    # Handler whose restart position is None
    def badencodereturn2(exc):
        return ("?", None)
    codecs.register_error("test.badencodereturn2", badencodereturn2)
    with self.assertRaises(TypeError):
        "\xff".encode("ascii", "test.badencodereturn2")

    # PosReturn.handle replies with "<?>" and whatever restart position
    # is currently stored in its .pos attribute.
    posreturn = PosReturn()
    codecs.register_error("test.posreturn", posreturn.handle)
    sample = "\xff0"

    # Valid negative position
    posreturn.pos = -1
    self.assertEqual(sample.encode("ascii", "test.posreturn"), b"<?>0")

    # Valid negative position
    posreturn.pos = -2
    self.assertEqual(sample.encode("ascii", "test.posreturn"), b"<?><?>")

    # Negative position out of bounds
    posreturn.pos = -3
    with self.assertRaises(IndexError):
        sample.encode("ascii", "test.posreturn")

    # Valid positive position
    posreturn.pos = 1
    self.assertEqual(sample.encode("ascii", "test.posreturn"), b"<?>0")

    # Largest valid positive position (one beyond end of input)
    posreturn.pos = 2
    self.assertEqual(sample.encode("ascii", "test.posreturn"), b"<?>")

    # Invalid positive position
    posreturn.pos = 3
    with self.assertRaises(IndexError):
        sample.encode("ascii", "test.posreturn")

    posreturn.pos = 0

    # A mapping whose every lookup fails with ValueError
    class RaisingMap(dict):
        def __getitem__(self, key):
            raise ValueError
    handler_names = ("strict", "replace", "xmlcharrefreplace",
                     "backslashreplace", "test.posreturn")
    for err in handler_names:
        with self.assertRaises(UnicodeError):
            codecs.charmap_encode("\xff", err, {0xff: None})
        with self.assertRaises(ValueError):
            codecs.charmap_encode("\xff", err, RaisingMap())
        with self.assertRaises(TypeError):
            codecs.charmap_encode("\xff", err, {0xff: 300})
Walter Dörwald30537a42003-01-08 23:22:13 +0000897
898 def test_translatehelper(self):
899 # enhance coverage of:
900 # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
901 # and callers
902 # (Unfortunately the errors argument is not directly accessible
903 # from Python, so we can't test that much)
904 class D(dict):
905 def __getitem__(self, key):
906 raise ValueError
Georg Brandledbcc132007-10-24 21:25:34 +0000907 #self.assertRaises(ValueError, "\xff".translate, D())
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000908 self.assertRaises(TypeError, "\xff".translate, {0xff: sys.maxunicode+1})
909 self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
Walter Dörwald30537a42003-01-08 23:22:13 +0000910
Walter Dörwald4894c302003-10-24 14:25:28 +0000911 def test_bug828737(self):
912 charmap = {
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000913 ord("&"): "&amp;",
914 ord("<"): "&lt;",
915 ord(">"): "&gt;",
916 ord('"'): "&quot;",
Walter Dörwald4894c302003-10-24 14:25:28 +0000917 }
Tim Peters58eb11c2004-01-18 20:29:55 +0000918
Walter Dörwald4894c302003-10-24 14:25:28 +0000919 for n in (1, 10, 100, 1000):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000920 text = 'abc<def>ghi'*n
Walter Dörwald4894c302003-10-24 14:25:28 +0000921 text.translate(charmap)
922
def test_mutatingdecodehandler(self):
    # Decoders must cope with an error handler that replaces the input
    # object stored on the exception it is given.
    baddata = [
        ("ascii", b"\xff"),
        ("utf-7", b"++"),
        ("utf-8", b"\xff"),
        ("utf-16", b"\xff"),
        ("utf-32", b"\xff"),
        ("unicode-escape", b"\\u123g"),
        ("raw-unicode-escape", b"\\u123g"),
        ("unicode-internal", b"\xff"),
    ]

    def replacing(exc):
        # Replace the input with an object that is not bytes at all.
        if isinstance(exc, UnicodeDecodeError):
            exc.object = 42
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.replacing", replacing)

    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            with self.assertRaises(TypeError):
                data.decode(encoding, "test.replacing")

    def mutating(exc):
        # Replace the input with empty bytes and restart at offset 0.
        # The input is an immutable bytes object, so it is rebound on
        # the exception rather than emptied in place.
        if isinstance(exc, UnicodeDecodeError):
            exc.object = b""
            return ("\u4242", 0)
        else:
            raise TypeError("don't know how to handle %r" % exc)
    codecs.register_error("test.mutating", mutating)
    # If the decoder doesn't pick up the modified input the following
    # will lead to an endless loop
    with test.support.check_warnings():
        # unicode-internal has been deprecated
        for (encoding, data) in baddata:
            # Only the replacement text is left once the input is empty.
            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
Walter Dörwalde78178e2007-07-30 13:31:40 +0000963
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000964
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()