blob: 1650965a99a76c14c6d25fb3d1ef917c43d0b3f7 [file] [log] [blame]
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001import test.test_support, unittest
2import sys, codecs, htmlentitydefs, unicodedata
3
4class CodecCallbackTest(unittest.TestCase):
5
6 def test_xmlcharrefreplace(self):
7 # replace unencodable characters which numeric character entities.
8 # For ascii, latin-1 and charmaps this is completely implemented
9 # in C and should be reasonably fast.
10 s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
11 self.assertEqual(
12 s.encode("ascii", "xmlcharrefreplace"),
13 "スパモ änd eggs"
14 )
15 self.assertEqual(
16 s.encode("latin-1", "xmlcharrefreplace"),
17 "スパモ \xe4nd eggs"
18 )
19
20 def test_xmlcharnamereplace(self):
21 # This time use a named character entity for unencodable
22 # characters, if one is available.
23 names = {}
24 for (key, value) in htmlentitydefs.entitydefs.items():
25 if len(value)==1:
26 names[unicode(value, "latin-1")] = unicode(key, "latin-1")
27 else:
28 names[unichr(int(value[2:-1]))] = unicode(key, "latin-1")
29
30 def xmlcharnamereplace(exc):
31 if not isinstance(exc, UnicodeEncodeError):
32 raise TypeError("don't know how to handle %r" % exc)
33 l = []
34 for c in exc.object[exc.start:exc.end]:
35 try:
36 l.append(u"&%s;" % names[c])
37 except KeyError:
38 l.append(u"&#%d;" % ord(c))
39 return (u"".join(l), exc.end)
40
41 codecs.register_error(
42 "test.xmlcharnamereplace", xmlcharnamereplace)
43
44 sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
45 sout = "«ℜ» = ⟨ሴ€⟩"
46 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
47 sout = "\xabℜ\xbb = ⟨ሴ€⟩"
48 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
49 sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩"
50 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
51
52 def test_uninamereplace(self):
53 # We're using the names from the unicode database this time,
54 # and we're doing "systax highlighting" here, i.e. we include
55 # the replaced text in ANSI escape sequences. For this it is
56 # useful that the error handler is not called for every single
57 # unencodable character, but for a complete sequence of
58 # unencodable characters, otherwise we would output many
59 # unneccessary escape sequences.
60
61 def uninamereplace(exc):
62 if not isinstance(exc, UnicodeEncodeError):
63 raise TypeError("don't know how to handle %r" % exc)
64 l = []
65 for c in exc.object[exc.start:exc.end]:
66 l.append(unicodedata.name(c, u"0x%x" % ord(c)))
67 return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
68
69 codecs.register_error(
70 "test.uninamereplace", uninamereplace)
71
72 sin = u"\xac\u1234\u20ac\u8000"
73 sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
74 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
75
76 sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
77 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
78
79 sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m"
80 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
81
82 def test_backslashescape(self):
83 # Does the same as the "unicode-escape" encoding, but with different
84 # base encodings.
85 sin = u"a\xac\u1234\u20ac\u8000"
86 if sys.maxunicode > 0xffff:
87 sin += unichr(sys.maxunicode)
88 sout = "a\\xac\\u1234\\u20ac\\u8000"
89 if sys.maxunicode > 0xffff:
90 sout += "\\U%08x" % sys.maxunicode
91 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
92
93 sout = "a\xac\\u1234\\u20ac\\u8000"
94 if sys.maxunicode > 0xffff:
95 sout += "\\U%08x" % sys.maxunicode
96 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
97
98 sout = "a\xac\\u1234\xa4\\u8000"
99 if sys.maxunicode > 0xffff:
100 sout += "\\U%08x" % sys.maxunicode
101 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
102
103 def test_relaxedutf8(self):
104 # This is the test for a decoding callback handler,
105 # that relaxes the UTF-8 minimal encoding restriction.
106 # A null byte that is encoded as "\xc0\x80" will be
107 # decoded as a null byte. All other illegal sequences
108 # will be handled strictly.
109 def relaxedutf8(exc):
110 if not isinstance(exc, UnicodeDecodeError):
111 raise TypeError("don't know how to handle %r" % exc)
112 if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
113 return (u"\x00", exc.start+2) # retry after two bytes
114 else:
115 raise exc
116
117 codecs.register_error(
118 "test.relaxedutf8", relaxedutf8)
119
120 sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
121 sout = u"a\x00b\x00c\xfc\x00\x00"
122 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
123 sin = "\xc0\x80\xc0\x81"
124 self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")
125
126 def test_charmapencode(self):
127 # For charmap encodings the replacement string will be
128 # mapped through the encoding again. This means, that
129 # to be able to use e.g. the "replace" handler, the
130 # charmap has to have a mapping for "?".
131 charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
132 sin = u"abc"
133 sout = "AABBCC"
134 self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
135
136 sin = u"abcA"
137 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
138
139 charmap[ord("?")] = "XYZ"
140 sin = u"abcDEF"
141 sout = "AABBCCXYZXYZXYZ"
142 self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
143
144 charmap[ord("?")] = u"XYZ"
145 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
146
147 charmap[ord("?")] = u"XYZ"
148 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
149
150 def test_callbacks(self):
151 def handler1(exc):
152 if not isinstance(exc, UnicodeEncodeError) \
153 and not isinstance(exc, UnicodeDecodeError):
154 raise TypeError("don't know how to handle %r" % exc)
155 l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
156 return (u"[%s]" % u"".join(l), exc.end)
157
158 codecs.register_error("test.handler1", handler1)
159
160 def handler2(exc):
161 if not isinstance(exc, UnicodeDecodeError):
162 raise TypeError("don't know how to handle %r" % exc)
163 l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
164 return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
165
166 codecs.register_error("test.handler2", handler2)
167
168 s = "\x00\x81\x7f\x80\xff"
169
170 self.assertEqual(
171 s.decode("ascii", "test.handler1"),
172 u"\x00[<129>]\x7f[<128>][<255>]"
173 )
174 self.assertEqual(
175 s.decode("ascii", "test.handler2"),
176 u"\x00[<129>][<128>]"
177 )
178
179 self.assertEqual(
180 "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
181 u"\u3042[<92><117><51><120>]xx"
182 )
183
184 self.assertEqual(
185 "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
186 u"\u3042[<92><117><51><120><120>]"
187 )
188
189 self.assertEqual(
190 codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
191 u"z[<98>][<99>]"
192 )
193
194 self.assertEqual(
195 u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
196 u"g[<252><223>]rk"
197 )
198
199 self.assertEqual(
200 u"g\xfc\xdf".encode("ascii", "test.handler1"),
201 u"g[<252><223>]"
202 )
203
204 def test_longstrings(self):
205 # test long strings to check for memory overflow problems
206 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
207 # register the handlers under different names,
208 # to prevent the codec from recognizing the name
209 for err in errors:
210 codecs.register_error("test." + err, codecs.lookup_error(err))
211 l = 1000
212 errors += [ "test." + err for err in errors ]
213 for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
214 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
215 for err in errors:
216 try:
217 uni.encode(enc, err)
218 except UnicodeError:
219 pass
220
221 def check_exceptionobjectargs(self, exctype, args, msg):
222 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
223 # check with one missing argument
224 self.assertRaises(TypeError, exctype, *args[:-1])
225 # check with one missing argument
226 self.assertRaises(TypeError, exctype, *(args + ["too much"]))
227 # check with one argument of the wrong type
228 wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
229 for i in xrange(len(args)):
230 for wrongarg in wrongargs:
231 if type(wrongarg) is type(args[i]):
232 continue
233 # build argument array
234 callargs = []
235 for j in xrange(len(args)):
236 if i==j:
237 callargs.append(wrongarg)
238 else:
239 callargs.append(args[i])
240 self.assertRaises(TypeError, exctype, *callargs)
241 exc = exctype(*args)
242 self.assertEquals(str(exc), msg)
243
244 def test_unicodeencodeerror(self):
245 self.check_exceptionobjectargs(
246 UnicodeEncodeError,
247 ["ascii", u"g\xfcrk", 1, 2, "ouch"],
248 "'ascii' codec can't encode character '\ufc' in position 1: ouch"
249 )
250 self.check_exceptionobjectargs(
251 UnicodeEncodeError,
252 ["ascii", u"g\xfcrk", 1, 4, "ouch"],
253 "'ascii' codec can't encode characters in position 1-3: ouch"
254 )
255 self.check_exceptionobjectargs(
256 UnicodeEncodeError,
257 ["ascii", u"\xfcx", 0, 1, "ouch"],
258 "'ascii' codec can't encode character '\ufc' in position 0: ouch"
259 )
260
261 def test_unicodedecodeerror(self):
262 self.check_exceptionobjectargs(
263 UnicodeDecodeError,
264 ["ascii", "g\xfcrk", 1, 2, "ouch"],
265 "'ascii' codec can't decode byte 0xfc in position 1: ouch"
266 )
267 self.check_exceptionobjectargs(
268 UnicodeDecodeError,
269 ["ascii", "g\xfcrk", 1, 3, "ouch"],
270 "'ascii' codec can't decode bytes in position 1-2: ouch"
271 )
272
273 def test_unicodetranslateerror(self):
274 self.check_exceptionobjectargs(
275 UnicodeTranslateError,
276 [u"g\xfcrk", 1, 2, "ouch"],
277 "can't translate character '\\ufc' in position 1: ouch"
278 )
279 self.check_exceptionobjectargs(
280 UnicodeTranslateError,
281 [u"g\xfcrk", 1, 3, "ouch"],
282 "can't translate characters in position 1-2: ouch"
283 )
284
285 def test_badandgoodstrictexceptions(self):
286 self.assertRaises(
287 TypeError,
288 codecs.strict_errors,
289 42
290 )
291 self.assertRaises(
292 Exception,
293 codecs.strict_errors,
294 Exception("ouch")
295 )
296
297 self.assertRaises(
298 UnicodeEncodeError,
299 codecs.strict_errors,
300 UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
301 )
302
303 def test_badandgoodignoreexceptions(self):
304 self.assertRaises(
305 TypeError,
306 codecs.ignore_errors,
307 42
308 )
309 self.assertRaises(
310 TypeError,
311 codecs.ignore_errors,
312 UnicodeError("ouch")
313 )
314 self.assertEquals(
315 codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
316 (u"", 1)
317 )
318 self.assertEquals(
319 codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
320 (u"", 1)
321 )
322 self.assertEquals(
323 codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
324 (u"", 1)
325 )
326
327 def test_badandgoodreplaceexceptions(self):
328 self.assertRaises(
329 TypeError,
330 codecs.replace_errors,
331 42
332 )
333 self.assertRaises(
334 TypeError,
335 codecs.replace_errors,
336 UnicodeError("ouch")
337 )
338 self.assertEquals(
339 codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
340 (u"?", 1)
341 )
342 self.assertEquals(
343 codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
344 (u"\ufffd", 1)
345 )
346 self.assertEquals(
347 codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
348 (u"\ufffd", 1)
349 )
350
351 def test_badandgoodxmlcharrefreplaceexceptions(self):
352 self.assertRaises(
353 TypeError,
354 codecs.xmlcharrefreplace_errors,
355 42
356 )
357 self.assertRaises(
358 TypeError,
359 codecs.xmlcharrefreplace_errors,
360 UnicodeError("ouch")
361 )
362 self.assertEquals(
363 codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
364 (u"&#%d;" % 0x3042, 1)
365 )
366 self.assertRaises(
367 TypeError,
368 codecs.xmlcharrefreplace_errors,
369 UnicodeError("ouch")
370 )
371 self.assertRaises(
372 TypeError,
373 codecs.xmlcharrefreplace_errors,
374 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
375 )
376 self.assertRaises(
377 TypeError,
378 codecs.xmlcharrefreplace_errors,
379 UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
380 )
381
382 def test_badandgoodbackslashreplaceexceptions(self):
383 self.assertRaises(
384 TypeError,
385 codecs.backslashreplace_errors,
386 42
387 )
388 self.assertRaises(
389 TypeError,
390 codecs.backslashreplace_errors,
391 UnicodeError("ouch")
392 )
393 self.assertEquals(
394 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
395 (u"\\u3042", 1)
396 )
397 self.assertEquals(
398 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
399 (u"\\x00", 1)
400 )
401 self.assertEquals(
402 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
403 (u"\\xff", 1)
404 )
405 self.assertEquals(
406 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
407 (u"\\u0100", 1)
408 )
409 self.assertEquals(
410 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
411 (u"\\uffff", 1)
412 )
413 if sys.maxunicode>0xffff:
414 self.assertEquals(
415 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
416 (u"\\U00010000", 1)
417 )
418 self.assertEquals(
419 codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
420 (u"\\U0010ffff", 1)
421 )
422
423 self.assertRaises(
424 TypeError,
425 codecs.backslashreplace_errors,
426 UnicodeError("ouch")
427 )
428 self.assertRaises(
429 TypeError,
430 codecs.backslashreplace_errors,
431 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
432 )
433 self.assertRaises(
434 TypeError,
435 codecs.backslashreplace_errors,
436 UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
437 )
438
439 def test_badhandlerresults(self):
440 results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
441 encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
442
443 for res in results:
444 codecs.register_error("test.badhandler", lambda: res)
445 for enc in encs:
446 self.assertRaises(
447 TypeError,
448 u"\u3042".encode,
449 enc,
450 "test.badhandler"
451 )
452 for (enc, bytes) in (
453 ("ascii", "\xff"),
454 ("utf-8", "\xff"),
455 ("utf-7", "+x-")
456 ):
457 self.assertRaises(
458 TypeError,
459 bytes.decode,
460 enc,
461 "test.badhandler"
462 )
463
464 def test_lookup(self):
465 self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
466 self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
467 self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
468 self.assertEquals(
469 codecs.xmlcharrefreplace_errors,
470 codecs.lookup_error("xmlcharrefreplace")
471 )
472 self.assertEquals(
473 codecs.backslashreplace_errors,
474 codecs.lookup_error("backslashreplace")
475 )
476
def test_main():
    # Collect every test method of CodecCallbackTest into one suite and
    # hand it to the regression-test runner.
    callback_tests = unittest.makeSuite(CodecCallbackTest)
    suite = unittest.TestSuite()
    suite.addTest(callback_tests)
    test.test_support.run_suite(suite)
481
# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()