# Extracted from the blame view of blob bfb417ca4325bac47f7a968c1b2fe7fe45aa7f2b.
from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    A minimal in-memory FIFO backed by a single string.  It offers just
    the ``write()``/``read()`` pair that the stream codecs call.
    """
    def __init__(self):
        # Everything written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append ``chars`` to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative ``size`` (the default) drains the whole queue; otherwise
        at most ``size`` characters are returned.  An empty string is
        returned when nothing is buffered.
        """
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    """Stream-reading tests shared by the per-encoding test classes.

    Subclasses set the class attribute ``encoding``; every method here
    reads it through ``self.encoding``.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of ``input``.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so the split points are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        # NOTE(review): every character of the string being split is
        # whitespace, so .split() appears to return [] and this loop (and
        # the lineend loop below) never executes -- confirm and, if so, use
        # an explicit tuple of line ends.  Enabling the loops would also
        # require revisiting the expectations ("|" joins, the "xxx\n"
        # lines, the empty first line for i == 0).
        # NOTE(review): u"\3042" is the octal escape \304 followed by "2";
        # u"\u3042" was probably intended -- confirm.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were written.
        # NOTE(review): runs of spaces inside these literals may have been
        # collapsed when this text was extracted; verify the line lengths
        # against the upstream file.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes available to
        # readline() piece by piece.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF32Test(ReadTest):
    """ReadTest checks for "utf-32", plus BOM and error handling tests."""
    encoding = "utf-32"

    # Accepted encodings of u"spamspam": a single BOM followed by the
    # characters, in little-endian and big-endian byte order.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

class UTF32LETest(ReadTest):
    """ReadTest checks for "utf-32-le" (no BOM is expected or consumed)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per encoded byte: a character only appears once its
        # fourth byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

class UTF32BETest(ReadTest):
    """ReadTest checks for "utf-32-be" (no BOM is expected or consumed)."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per encoded byte: a character only appears once its
        # fourth byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

class UTF16Test(ReadTest):
    """ReadTest checks for "utf-16", plus BOM and error handling tests."""
    encoding = "utf-16"

    # Accepted encodings of u"spamspam": a single BOM followed by the
    # characters, in little-endian and big-endian byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    """ReadTest checks for "utf-16-le" (no BOM is expected or consumed)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per encoded byte: a character only appears once its
        # second byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    """ReadTest checks for "utf-16-be" (no BOM is expected or consumed)."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per encoded byte: a character only appears once its
        # second byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    """ReadTest checks for "utf-8"."""
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: the input characters take 1, 2, 2, 3
        # and 3 bytes, and each appears once its last byte has been fed in.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    """ReadTest checks for "utf-7"; only the inherited tests are run."""
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):
    """Error handling tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the required data argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # An array of characters is accepted as input.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # A missing argument or an integer argument is a TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode()."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # A missing argument or an integer argument is a TypeError.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    """ReadTest checks for "utf-8-sig", plus BOM-specific stream tests."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The signature written by the encoder must not show up in the
        # incrementally decoded text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader using a range
        # of read() size hints (and no hint at all) and check that the
        # concatenated chunks equal unistring every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a signature must decode identically.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    """Smoke test for codecs.EncodedFile()."""

    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
# Each entry is a (unicode text, punycode encoding) pair.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check: print any entry of punycode_testcases that is
# not a 2-tuple.  print(...) with a single argument produces the same
# output as the Python 2 print statement.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))

class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases vectors through the "punycode" codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec.

    Every test body is guarded by ``sys.maxunicode > 0xffff`` and does
    nothing otherwise.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                # The literals above are big-endian; reverse them on
                # little-endian machines.
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                # Decoding must not succeed.
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            # Register the stock "ignore" handler under a private name and
            # check that the four invalid bytes in the middle are dropped.
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks input that must be rejected, and
# (None, None) marks a test vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    """Run every vector in ``nameprep_tests`` through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, vector in enumerate(nameprep_tests):
            source, expected = vector
            if source is None:
                # Placeholder entry: this vector is deliberately skipped.
                continue
            # Vectors are stored as UTF-8 encoded byte strings.
            source = unicode(source, "utf-8")
            if expected is None:
                # No expected output means the input must be rejected.
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            expected = unicode(expected, "utf-8")
            try:
                self.assertEquals(nameprep(source), expected)
            except Exception as err:
                # Re-raise with the vector number (3.<n>, counted from 1)
                # so a failure can be traced back to its list entry.
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(err)))
967
Walter Dörwald78a0be62006-04-14 18:25:39 +0000968class IDNACodecTest(unittest.TestCase):
969 def test_builtin_decode(self):
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000970 self.assertEquals(unicode("python.org", "idna"), u"python.org")
Walter Dörwald78a0be62006-04-14 18:25:39 +0000971 self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
972 self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
973 self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
974
975 def test_builtin_encode(self):
976 self.assertEquals(u"python.org".encode("idna"), "python.org")
977 self.assertEquals("python.org.".encode("idna"), "python.org.")
978 self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
979 self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000980
Martin v. Löwis8b595142005-08-25 11:03:38 +0000981 def test_stream(self):
982 import StringIO
983 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
984 r.read(3)
985 self.assertEquals(r.read(), u"")
986
Walter Dörwald78a0be62006-04-14 18:25:39 +0000987 def test_incremental_decode(self):
988 self.assertEquals(
989 "".join(codecs.iterdecode("python.org", "idna")),
990 u"python.org"
991 )
992 self.assertEquals(
993 "".join(codecs.iterdecode("python.org.", "idna")),
994 u"python.org."
995 )
996 self.assertEquals(
997 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
998 u"pyth\xf6n.org."
999 )
1000 self.assertEquals(
1001 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1002 u"pyth\xf6n.org."
1003 )
1004
1005 decoder = codecs.getincrementaldecoder("idna")()
1006 self.assertEquals(decoder.decode("xn--xam", ), u"")
1007 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1008 self.assertEquals(decoder.decode(u"rg"), u"")
1009 self.assertEquals(decoder.decode(u"", True), u"org")
1010
1011 decoder.reset()
1012 self.assertEquals(decoder.decode("xn--xam", ), u"")
1013 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1014 self.assertEquals(decoder.decode("rg."), u"org.")
1015 self.assertEquals(decoder.decode("", True), u"")
1016
1017 def test_incremental_encode(self):
1018 self.assertEquals(
1019 "".join(codecs.iterencode(u"python.org", "idna")),
1020 "python.org"
1021 )
1022 self.assertEquals(
1023 "".join(codecs.iterencode(u"python.org.", "idna")),
1024 "python.org."
1025 )
1026 self.assertEquals(
1027 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1028 "xn--pythn-mua.org."
1029 )
1030 self.assertEquals(
1031 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1032 "xn--pythn-mua.org."
1033 )
1034
1035 encoder = codecs.getincrementalencoder("idna")()
1036 self.assertEquals(encoder.encode(u"\xe4x"), "")
1037 self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1038 self.assertEquals(encoder.encode(u"", True), "org")
1039
1040 encoder.reset()
1041 self.assertEquals(encoder.encode(u"\xe4x"), "")
1042 self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1043 self.assertEquals(encoder.encode(u"", True), "")
1044
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001045class CodecsModuleTest(unittest.TestCase):
1046
1047 def test_decode(self):
1048 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1049 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001050 self.assertRaises(TypeError, codecs.decode)
1051 self.assertEquals(codecs.decode('abc'), u'abc')
1052 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1053
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001054 def test_encode(self):
1055 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1056 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001057 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001058 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +00001059 self.assertEquals(codecs.encode(u'abc'), 'abc')
1060 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1061
1062 def test_register(self):
1063 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001064 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001065
1066 def test_lookup(self):
1067 self.assertRaises(TypeError, codecs.lookup)
1068 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001069 self.assertRaises(LookupError, codecs.lookup, " ")
1070
1071 def test_getencoder(self):
1072 self.assertRaises(TypeError, codecs.getencoder)
1073 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1074
1075 def test_getdecoder(self):
1076 self.assertRaises(TypeError, codecs.getdecoder)
1077 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1078
1079 def test_getreader(self):
1080 self.assertRaises(TypeError, codecs.getreader)
1081 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1082
1083 def test_getwriter(self):
1084 self.assertRaises(TypeError, codecs.getwriter)
1085 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001086
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 StreamReader over an in-memory stream."""

    def setUp(self):
        # UTF-8 for U+D55C, a newline, then U+AE00.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_lines = self.reader(self.stream).readlines()
        # The newline stays attached to the first line.
        self.assertEquals(decoded_lines, [u'\ud55c\n', u'\uae00'])
1096
class EncodedFileTest(unittest.TestCase):
    """Check transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller gets UTF-16-LE.
        source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEquals(recoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file gets Latin-1.
        sink = StringIO.StringIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoder.write('\xc3\xbc')
        self.assertEquals(sink.getvalue(), '\xfc')
1108
class Str2StrTest(unittest.TestCase):
    """Stream readers of str-to-str codecs must hand back str objects."""

    def _base64_reader(self):
        # Reader over the base64 encoding of the single byte "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        decoded = self._base64_reader().read()
        self.assertEqual(decoded, "\x80")
        self.assert_(isinstance(decoded, str))

    def test_readline(self):
        decoded = self._base64_reader().readline()
        self.assertEqual(decoded, "\x80")
        self.assert_(isinstance(decoded, str))
1124
# Codecs exercised by BasicUnicodeTest.  Optional codecs (mbcs, bz2_codec,
# zlib_codec) are appended further down when they are available.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213", "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# mbcs is only present when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken before bz2_codec/zlib_codec are appended below, so those
# two are excluded from stream tests only, not from incremental-coder tests.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# Register the compression codecs only when their extension modules exist.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1273
class BasicUnicodeTest(unittest.TestCase):
    # Generic checks that are run against every codec named in
    # all_unicode_encodings: round trips through the stateless, stream and
    # incremental interfaces, seeking, and argument validation.

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by codecs.lookup() must agree with the list
            # entry once "_" and "-" are treated alike.  The "_codec" suffix
            # is re-added and "latin_1" is forced before comparing --
            # presumably lookup() reports those codecs under other names.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip through getencoder()/getdecoder().
            (bytes, size) = codecs.getencoder(encoding)(s)
            # unicode_internal is exempt from the consumed-length check.
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect whatever the
                # writer has pushed into the queue so far.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                # Feed the encoded form back one byte at a time.
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # The final call (second argument True) flushes any
                    # state still buffered in the coder.
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Note: unlike the strict-mode checks above, no
                        # final (flushing) call is made here.
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            # Rewind and re-read the first line several times; each pass
            # must return a prefix of the original text.
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but idna
        # and punycode must also reject a non-string argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # Trivially true comparison: it only forces the type object to be used.
        self.assertEqual(table_type, table_type)
1397
class BasicStrTest(unittest.TestCase):
    """Round-trip a byte string through every codec in all_string_encodings."""

    def test_basics(self):
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            # The encoder must report the whole input as consumed.
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, encoding))
1406
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001407class CharmapTest(unittest.TestCase):
1408 def test_decode_with_string_map(self):
1409 self.assertEquals(
1410 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1411 (u"abc", 3)
1412 )
1413
1414 self.assertEquals(
1415 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1416 (u"ab\ufffd", 3)
1417 )
1418
1419 self.assertEquals(
1420 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1421 (u"ab\ufffd", 3)
1422 )
1423
1424 self.assertEquals(
1425 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1426 (u"ab", 3)
1427 )
1428
1429 self.assertEquals(
1430 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1431 (u"ab", 3)
1432 )
1433
1434 allbytes = "".join(chr(i) for i in xrange(256))
1435 self.assertEquals(
1436 codecs.charmap_decode(allbytes, "ignore", u""),
1437 (u"", len(allbytes))
1438 )
1439
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers must be usable as context managers."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        recoded = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoded as ef:
            # UTF-8 in the underlying file is read back as Latin-1.
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEquals(srw.read(), u"\xfc")
1452
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001453
def test_main():
    """Run every test case class of this module through test_support."""
    # Order is kept as-is: run_unittest receives the classes in this sequence.
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test, UTF16ExTest,
        ReadBufferTest, CharBufferTest,
        EscapeDecodeTest, RecodingTest,
        PunycodeTest, UnicodeInternalTest,
        NameprepTest, IDNACodecTest,
        CodecsModuleTest, StreamReaderTest, EncodedFileTest,
        Str2StrTest, BasicUnicodeTest, BasicStrTest,
        CharmapTest, WithStmtTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001483
1484
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()