blob: 7c5eb5707fd325cb223064f8c259ce02eb645874 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO: whatever is appended with write() is handed back,
    in order, by read().
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append to the tail of the pending data.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most size
        # items are taken from the head and the remainder stays queued.
        pending = self._buffer
        if size < 0:
            self._buffer = ""
            return pending
        self._buffer = pending[size:]
        return pending[:size]
25
class ReadTest(unittest.TestCase):
    """
    Reader checks shared by the per-encoding test classes.

    Every method reads self.encoding, which is not defined here; the
    subclasses supply it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to the codec one byte at a time
        # and, after each byte, compare everything decoded so far with the
        # entry of partialresults at the same position.
        encoded = input.encode(self.encoding)
        # zip() stops at the shorter sequence, so without this check a
        # partialresults list of the wrong length would silently leave part
        # of the input (or of the expectations) unverified.
        self.assertEqual(len(encoded), len(partialresults))

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Call readline() until it returns something empty and return
            # the collected lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        # NOTE(review): split() without arguments splits on whitespace and
        # every character of the literal below is whitespace, so it yields
        # an empty list: the loop body never runs and both assertions
        # compare two empty strings.  Making the loop effective needs an
        # explicit tuple of line ends, a non-empty first line and
        # "|".join() on the expected side (readalllines() joins with "|").
        # That has to be validated against every ReadTest subclass first,
        # so the runtime behavior is deliberately left unchanged here.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # u"\u3042" used to be spelled u"\3042", i.e. the octal escape
            # \304 followed by the digit 2.
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        # NOTE(review): the inner loop iterates over the same split()
        # result as above, which is empty, so nothing below is executed.
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were written, for this sample of CRLF-terminated text.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so the reader only ever sees
        # what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three CRLF-terminated lines must come back unchanged, followed by
        # an empty result at the end of the stream.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-32" codec."""
    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, in little-endian and in
    # big-endian byte order.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two separate writes go through one stream writer ...
        sink = StringIO.StringIO()
        out = codecs.getwriter(self.encoding)(sink)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = sink.getvalue()
        # ... and the output must contain exactly one BOM
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # reading it back has to give the original text
        source = codecs.getreader(self.encoding)(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        # Neither one nor two groups of four 0xff bytes start with a BOM.
        for garbage in (4*"\xff", 8*"\xff"):
            f = codecs.getreader(self.encoding)(StringIO.StringIO(garbage))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        expected = (
            4 * [u""]           # the four BOM bytes => byteorder known
            + 3 * [u""]         # first three bytes of the first character
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
311
class UTF32LETest(ReadTest):
    """ReadTest checks for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Four bytes per character: nothing new is decoded until the
        # fourth byte of each character has been fed.
        expected = (
            3 * [u""]
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)
344
class UTF32BETest(ReadTest):
    """ReadTest checks for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Four bytes per character: nothing new is decoded until the
        # fourth byte of each character has been fed.
        expected = (
            3 * [u""]
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)
377
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-16" codec."""
    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, in little-endian and in
    # big-endian byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two separate writes go through one stream writer ...
        sink = StringIO.StringIO()
        out = codecs.getwriter(self.encoding)(sink)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = sink.getvalue()
        # ... and the output must contain exactly one BOM
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # reading it back has to give the original text
        source = codecs.getreader(self.encoding)(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        # Neither input starts with a valid BOM.
        for garbage in ("\xff\xff", "\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(StringIO.StringIO(garbage))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        expected = (
            2 * [u""]           # the two BOM bytes => byteorder known
            + [u""]             # first byte of the first character
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          "\xff", "strict", True)
427
class UTF16LETest(ReadTest):
    """ReadTest checks for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Two bytes per character: a character appears once its second
        # byte has been fed.
        expected = (
            [u""]
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          "\xff", "strict", True)
448
class UTF16BETest(ReadTest):
    """ReadTest checks for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Two bytes per character: a character appears once its second
        # byte has been fed.
        expected = (
            [u""]
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          "\xff", "strict", True)
469
class UTF8Test(ReadTest):
    """ReadTest checks for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = u"\x00\xff\u07ff\u0800\uffff"
        # The characters take 1, 2, 2, 3 and 3 bytes; each one shows up
        # when its last byte has been fed.
        expected = (
            2 * [text[:1]]
            + 2 * [text[:2]]
            + 3 * [text[:3]]
            + 3 * [text[:4]]
            + [text]
        )
        self.check_partial(text, expected)
490
class UTF7Test(ReadTest):
    """Runs the inherited ReadTest checks against the "utf-7" codec."""
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
495
496class UTF16ExTest(unittest.TestCase):
497
498 def test_errors(self):
499 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
500
501 def test_bad_args(self):
502 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
503
504class ReadBufferTest(unittest.TestCase):
505
506 def test_array(self):
507 import array
508 self.assertEqual(
509 codecs.readbuffer_encode(array.array("c", "spam")),
510 ("spam", 4)
511 )
512
513 def test_empty(self):
514 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
515
516 def test_bad_args(self):
517 self.assertRaises(TypeError, codecs.readbuffer_encode)
518 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
519
520class CharBufferTest(unittest.TestCase):
521
522 def test_string(self):
523 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
524
525 def test_empty(self):
526 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
527
528 def test_bad_args(self):
529 self.assertRaises(TypeError, codecs.charbuffer_encode)
530 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
531
class UTF8SigTest(ReadTest):
    """ReadTest checks plus signature handling for the "utf-8-sig" codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # The text itself starts with U+FEFF, so its encoded form carries
        # two BOMs: the signature and the encoded first character.
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = (
            3 * [u""]           # first BOM has been read and skipped
            + 2 * [u""]         # first two bytes of the second BOM
            + [text[:1]]        # second BOM has been read and emitted
            + [text[:2]]        # "\x00" read and emitted
            + [text[:2]]        # first byte of encoded u"\xff" read
            + [text[:3]]        # second byte of encoded u"\xff" read
            + [text[:3]]        # first byte of encoded u"\u07ff" read
            + 3 * [text[:4]]    # u"\u07ff" complete, u"\u0800" pending
            + 3 * [text[:5]]    # u"\u0800" complete, u"\uffff" pending
            + [text]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        encoded = text.encode("utf-8-sig")
        self.assertEqual(decoder.decode(encoded), text)
567
Walter Dörwald8709a422002-09-03 13:53:40 +0000568class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000569 def test_empty(self):
Walter Dörwald8709a422002-09-03 13:53:40 +0000570 self.assertEquals(codecs.escape_decode(""), ("", 0))
571
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000572class RecodingTest(unittest.TestCase):
573 def test_recoding(self):
574 f = StringIO.StringIO()
575 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
576 f2.write(u"a")
577 f2.close()
578 # Python used to crash on this at exit because of a refcount
579 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000580
# Sample strings from RFC 3492: (unicode text, Punycode form) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: report every entry that is not a pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    sys.stdout.write(repr(i) + "\n")
688
class PunycodeTest(unittest.TestCase):
    """Runs the punycode_testcases samples through the "punycode" codec."""

    def test_encode(self):
        for text, sample in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just the sample
            # to lower is also insufficient, since some of the input
            # characters are upper case.
            produced = text.encode("punycode")
            self.assertEqual(produced.lower(), sample.lower())

    def test_decode(self):
        for text, sample in punycode_testcases:
            decoded = sample.decode("punycode")
            self.assertEqual(text, decoded)
702
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000703class UnicodeInternalTest(unittest.TestCase):
704 def test_bug1251300(self):
705 # Decoding with unicode_internal used to not correctly handle "code
706 # points" above 0x10ffff on UCS-4 builds.
707 if sys.maxunicode > 0xffff:
708 ok = [
709 ("\x00\x10\xff\xff", u"\U0010ffff"),
710 ("\x00\x00\x01\x01", u"\U00000101"),
711 ("", u""),
712 ]
713 not_ok = [
714 "\x7f\xff\xff\xff",
715 "\x80\x00\x00\x00",
716 "\x81\x00\x00\x00",
717 "\x00",
718 "\x00\x00\x00\x00\x00",
719 ]
720 for internal, uni in ok:
721 if sys.byteorder == "little":
722 internal = "".join(reversed(internal))
723 self.assertEquals(uni, internal.decode("unicode_internal"))
724 for internal in not_ok:
725 if sys.byteorder == "little":
726 internal = "".join(reversed(internal))
727 self.assertRaises(UnicodeDecodeError, internal.decode,
728 "unicode_internal")
729
730 def test_decode_error_attributes(self):
731 if sys.maxunicode > 0xffff:
732 try:
733 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
734 except UnicodeDecodeError, ex:
735 self.assertEquals("unicode_internal", ex.encoding)
736 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
737 self.assertEquals(4, ex.start)
738 self.assertEquals(8, ex.end)
739 else:
740 self.fail()
741
742 def test_decode_callback(self):
743 if sys.maxunicode > 0xffff:
744 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
745 decoder = codecs.getdecoder("unicode_internal")
746 ab = u"ab".encode("unicode_internal")
747 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
748 "UnicodeInternalTest")
749 self.assertEquals((u"ab", 12), ignored)
750
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both UTF-8 encoded.  An expected
# output of None marks input that nameprep must reject; (None, None) marks
# a vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # (the vector is titled U+FB38, but the bytes below encode U+FE76)
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
903
904
class NameprepTest(unittest.TestCase):
    """Runs the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception:
                # Re-raise with the number of the failing vector.
                problem = sys.exc_info()[1]
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(problem)))
923
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through the builtin, stream and
    incremental interfaces, with and without a trailing dot."""

    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        # A plain (non-unicode) string is passed to encode() here.
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been consumed, a further read() must
        # return an empty unicode string.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        # Same four names as test_builtin_decode, fed through iterdecode().
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        # Previously this assertion duplicated the trailing-dot case below,
        # leaving the name without a trailing dot untested.
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam"), u"")
        self.assertEquals(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        # With a trailing dot the last label is emitted as soon as the
        # dot arrives, so the final call has nothing left to return.
        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam"), u"")
        self.assertEquals(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode, fed through iterencode().
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        # Previously this assertion duplicated the trailing-dot case below,
        # leaving the name without a trailing dot untested.
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        # With a trailing dot everything is emitted before the final call.
        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")
1000
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and simple conversions for the module-level
    functions of codecs."""

    def _check_lookup_errors(self, func):
        # Calling without arguments is a TypeError; an unknown codec name
        # is a LookupError.
        self.assertRaises(TypeError, func)
        self.assertRaises(LookupError, func, "__spam__")

    def test_decode(self):
        latin1_bytes = '\xe4\xf6\xfc'
        self.assertEquals(codecs.decode(latin1_bytes, 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument given.
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        latin1_text = u'\xe4\xf6\xfc'
        self.assertEquals(codecs.encode(latin1_text, 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument given.
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # A non-callable search function is rejected.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self._check_lookup_errors(codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self._check_lookup_errors(codecs.getencoder)

    def test_getdecoder(self):
        self._check_lookup_errors(codecs.getdecoder)

    def test_getreader(self):
        self._check_lookup_errors(codecs.getreader)

    def test_getwriter(self):
        self._check_lookup_errors(codecs.getwriter)
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001042
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # U+D55C, a newline and U+AE00, encoded as UTF-8.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEquals(lines, [u'\ud55c\n', u'\uae00'])
1052
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8 bytes and the wrapper
        # hands them out re-encoded as UTF-16-LE.
        source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEquals(recoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written to the wrapper reach the
        # underlying file as Latin-1.
        sink = StringIO.StringIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoder.write('\xc3\xbc')
        self.assertEquals(sink.getvalue(), '\xfc')
1064
class Str2StrTest(unittest.TestCase):
    """Stream readers of str-to-str codecs must return str objects."""

    def _base64_reader(self):
        # Build a base64_codec StreamReader over the encoded form of the
        # single byte "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        sout = self._base64_reader().read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sout = self._base64_reader().readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))
1080
# Codecs exercised by BasicUnicodeTest.  The order is the order in which
# they are tested; entries are grouped by family purely for readability.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    # code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    # EUC / GB
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    # ISO 2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO 8859
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    # Macintosh
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    # UTF
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken before bz2_codec/zlib_codec are appended below, so those
# two are excluded from the stream checks but not from the incremental ones.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are only tested when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings += ["bz2_codec"]
    broken_unicode_with_streams += ["bz2_codec"]

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings += ["zlib_codec"]
    broken_unicode_with_streams += ["zlib_codec"]
1229
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks applied to every codec in all_unicode_encodings."""

    def test_basics(self):
        text = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            def mismatch(got, want):
                # Failure message shared by all round-trip assertions.
                return "%r != %r (encoding=%r)" % (got, want, encoding)

            # The name reported by lookup() must agree with the name used
            # here once "_" and "-" are unified; the "_codec" suffix is
            # re-added and latin_1 is special-cased before comparing.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            (encoded, consumed) = codecs.getencoder(encoding)(text)
            if encoding != "unicode_internal":
                self.assertEqual(consumed, len(text),
                                 mismatch(consumed, len(text)))
            (decoded, consumed) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, mismatch(decoded, text))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, one character/byte at a time
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                pieces = []
                for ch in text:
                    writer.write(ch)
                    pieces.append(pipe.read())
                streamed = "".join(pieces)
                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                decoded = u""
                for byte in streamed:
                    pipe.write(byte)
                    decoded += reader.read()
                self.assertEqual(decoded, text, mismatch(decoded, text))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encoded = "".join(encoder.encode(ch) for ch in text)
                    encoded += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decoded = u"".join(decoder.decode(byte) for byte in encoded)
                    decoded += decoder.decode("", True)
                    self.assertEqual(decoded, text, mismatch(decoded, text))

                    # check C API
                    encoded = "".join(cencoder.encode(ch) for ch in text)
                    encoded += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decoded = u"".join(cdecoder.decode(byte) for byte in encoded)
                    decoded += cdecoder.decode("", True)
                    self.assertEqual(decoded, text, mismatch(decoded, text))

                    # check iterencode()/iterdecode()
                    roundtrip = u"".join(codecs.iterdecode(
                        codecs.iterencode(text, encoding), encoding))
                    self.assertEqual(roundtrip, text, mismatch(roundtrip, text))

                    # check iterencode()/iterdecode() with empty string
                    roundtrip = u"".join(codecs.iterdecode(
                        codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(roundtrip, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encoded = ""
                        for ch in text:
                            encoded += encoder.encode(ch)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decoded = u""
                        for byte in encoded:
                            decoded += decoder.decode(byte)
                        self.assertEqual(decoded, text, mismatch(decoded, text))

                        encoded = ""
                        for ch in text:
                            encoded += cencoder.encode(ch)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decoded = u""
                        for byte in encoded:
                            decoded += cdecoder.decode(byte)
                        self.assertEqual(decoded, text, mismatch(decoded, text))

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: idna is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                first_line = reader.readline()
                self.assertEqual(text[:len(first_line)], first_line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            # idna and punycode are exempt from the non-string check.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encode)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1353
class BasicStrTest(unittest.TestCase):
    """Stateless round trip for the codecs in all_string_encodings."""

    def test_basics(self):
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            # The encoder must report having consumed the whole input.
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, encoding))
1362
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with a unicode string as the mapping."""

    def test_decode_with_string_map(self):
        three_bytes = "\x00\x01\x02"
        # (error handler, mapping string, expected decoded text).
        # As the expected values show, a byte past the end of the mapping
        # and a byte mapped to U+FFFE are both handed to the error handler.
        cases = (
            ("strict", u"abc", u"abc"),
            ("replace", u"ab", u"ab\ufffd"),
            ("replace", u"ab\ufffe", u"ab\ufffd"),
            ("ignore", u"ab", u"ab"),
            ("ignore", u"ab\ufffe", u"ab"),
        )
        for errors, mapping, expected in cases:
            self.assertEquals(
                codecs.charmap_decode(three_bytes, errors, mapping),
                (expected, 3)
            )

        # With an empty mapping every byte is undefined; "ignore" drops
        # them all but still reports the whole input as consumed.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
1395
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers must be usable in a with statement."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            # UTF-8 bytes in the file are read back as Latin-1.
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEquals(srw.read(), u"\xfc")
1408
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001409
def test_main():
    # Hand the listed TestCase classes of this module to the regression
    # test runner, in the order given.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()