blob: cee819ca0af0c1fbaa281023add45f7363451ff2 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO of characters: write() appends at the tail and
    read() consumes from the head.
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append chars to the data that is still waiting to be read.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains everything that is pending; otherwise at
        # most size characters are handed out and the rest is kept.
        if size < 0:
            drained, self._buffer = self._buffer, ""
            return drained
        head = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return head
25
class ReadTest(unittest.TestCase):
    # Shared tests for stream readers and incremental decoders.  Every
    # method reads self.encoding, which is not defined in this class and
    # has to be supplied by a subclass.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|" so line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: every character in
        # u"\n \r\n \r \u2028" is whitespace, so calling .split() on it
        # returns an empty list and the loops below would never execute.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty; readalllines()
            # stops at the first empty line when keepends is false.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected values must too
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the lines of s in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one Queue, so the reader only
        # ever sees what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    # Runs the ReadTest checks against the "utf-32" codec and adds
    # BOM-specific tests.
    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, little- and big-endian
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # encode some stream
        s = StringIO.StringIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte passed with final=True must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
311
class UTF32LETest(ReadTest):
    # Runs the ReadTest checks against the "utf-32-le" codec.
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per input byte; each character takes four
        # bytes, so a new character shows up on every fourth byte.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] * 3 +
            [u"\x00"] * 4 +
            [u"\x00\xff"] * 4 +
            [u"\x00\xff\u0100"] * 4 +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte passed with final=True must raise.
        decode = codecs.utf_32_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
344
class UTF32BETest(ReadTest):
    # Runs the ReadTest checks against the "utf-32-be" codec.
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per input byte; each character takes four
        # bytes, so a new character shows up on every fourth byte.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] * 3 +
            [u"\x00"] * 4 +
            [u"\x00\xff"] * 4 +
            [u"\x00\xff\u0100"] * 4 +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte passed with final=True must raise.
        decode = codecs.utf_32_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
377
class UTF16Test(ReadTest):
    # Runs the ReadTest checks against the "utf-16" codec and adds
    # BOM-specific tests.
    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, little- and big-endian
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # encode some stream
        s = StringIO.StringIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte passed with final=True must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
427
class UTF16LETest(ReadTest):
    # Runs the ReadTest checks against the "utf-16-le" codec.
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per input byte; each character takes two
        # bytes, so a new character shows up on every second byte.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u0100"] * 2 +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # A single byte passed with final=True must raise.
        decode = codecs.utf_16_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
448
class UTF16BETest(ReadTest):
    # Runs the ReadTest checks against the "utf-16-be" codec.
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per input byte; each character takes two
        # bytes, so a new character shows up on every second byte.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u0100"] * 2 +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # A single byte passed with final=True must raise.
        decode = codecs.utf_16_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
469
class UTF8Test(ReadTest):
    # Runs the ReadTest checks against the "utf-8" codec.
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per input byte: the five characters take
        # 1, 2, 2, 3 and 3 bytes, and a character only appears once its
        # last byte has been fed.
        text = u"\x00\xff\u07ff\u0800\uffff"
        expected = (
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u07ff"] * 3 +
            [u"\x00\xff\u07ff\u0800"] * 3 +
            [u"\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)
490
class UTF7Test(ReadTest):
    # Runs the ReadTest checks against the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per byte of the encoded form of u"a+-b".
        text = u"a+-b"
        expected = [u"a"] * 2 + [u"a+", u"a+-", u"a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000505
506class UTF16ExTest(unittest.TestCase):
507
508 def test_errors(self):
509 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
510
511 def test_bad_args(self):
512 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
513
514class ReadBufferTest(unittest.TestCase):
515
516 def test_array(self):
517 import array
518 self.assertEqual(
519 codecs.readbuffer_encode(array.array("c", "spam")),
520 ("spam", 4)
521 )
522
523 def test_empty(self):
524 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
525
526 def test_bad_args(self):
527 self.assertRaises(TypeError, codecs.readbuffer_encode)
528 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
529
530class CharBufferTest(unittest.TestCase):
531
532 def test_string(self):
533 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
534
535 def test_empty(self):
536 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
537
538 def test_bad_args(self):
539 self.assertRaises(TypeError, codecs.charbuffer_encode)
540 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
541
class UTF8SigTest(ReadTest):
    # Runs the ReadTest checks against the "utf-8-sig" codec and adds
    # BOM-handling tests.
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected result per byte of the encoded input.
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = (
            # Three bytes of the first BOM (read and skipped), then the
            # first two bytes of the second BOM.
            [u""] * 5 +
            # Second BOM has been read and emitted
            [u"\ufeff"] +
            # "\x00" read and emitted, then first byte of encoded u"\xff"
            [u"\ufeff\x00"] * 2 +
            # Second byte of encoded u"\xff", then first byte of u"\u07ff"
            [u"\ufeff\x00\xff"] * 2 +
            # Second byte of encoded u"\u07ff" read
            [u"\ufeff\x00\xff\u07ff"] * 3 +
            [u"\ufeff\x00\xff\u07ff\u0800"] * 3 +
            [u"\ufeff\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # Decoding the utf-8-sig encoding of a string with the
        # incremental decoder gives the string back.
        s = u"spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range
        # of size hints (None means a plain read()) and check that the
        # collected output equals unistring every time.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = make_reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM
        self._check_stream(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # Input without a BOM
        self._check_stream(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )
621
Walter Dörwald8709a422002-09-03 13:53:40 +0000622class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000623 def test_empty(self):
Walter Dörwald8709a422002-09-03 13:53:40 +0000624 self.assertEquals(codecs.escape_decode(""), ("", 0))
625
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so writing and closing without an
        # error is all that is checked here.
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000634
Martin v. Löwis2548c732003-04-18 10:39:54 +0000635# From RFC 3492
636punycode_testcases = [
637 # A Arabic (Egyptian):
638 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
639 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
640 "egbpdaj6bu4bxfgehfvwxn"),
641 # B Chinese (simplified):
642 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
643 "ihqwcrb4cv8a8dqg056pqjye"),
644 # C Chinese (traditional):
645 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
646 "ihqwctvzc91f659drss3x8bo0yb"),
647 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
648 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
649 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
650 u"\u0065\u0073\u006B\u0079",
651 "Proprostnemluvesky-uyb24dma41a"),
652 # E Hebrew:
653 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
654 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
655 u"\u05D1\u05E8\u05D9\u05EA",
656 "4dbcagdahymbxekheh6e0a7fei0b"),
657 # F Hindi (Devanagari):
658 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
659 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
660 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
661 u"\u0939\u0948\u0902",
662 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
663
664 #(G) Japanese (kanji and hiragana):
665 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
666 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
667 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
668
669 # (H) Korean (Hangul syllables):
670 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
671 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
672 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
673 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
674 "psd879ccm6fea98c"),
675
676 # (I) Russian (Cyrillic):
677 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
678 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
679 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
680 u"\u0438",
681 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
682
683 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
684 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
685 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
686 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
687 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
688 u"\u0061\u00F1\u006F\u006C",
689 "PorqunopuedensimplementehablarenEspaol-fmd56a"),
690
691 # (K) Vietnamese:
692 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
693 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
694 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
695 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
696 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
697 u"\u0056\u0069\u1EC7\u0074",
698 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
699
Martin v. Löwis2548c732003-04-18 10:39:54 +0000700 #(L) 3<nen>B<gumi><kinpachi><sensei>
701 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
702 "3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +0000703
Martin v. Löwis2548c732003-04-18 10:39:54 +0000704 # (M) <amuro><namie>-with-SUPER-MONKEYS
705 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
706 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
707 u"\u004F\u004E\u004B\u0045\u0059\u0053",
708 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
709
710 # (N) Hello-Another-Way-<sorezore><no><basho>
711 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
712 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
713 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
714 "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
715
716 # (O) <hitotsu><yane><no><shita>2
717 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
718 "2-u9tlzr9756bt3uc0v"),
719
720 # (P) Maji<de>Koi<suru>5<byou><mae>
721 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
722 u"\u308B\u0035\u79D2\u524D",
723 "MajiKoi5-783gue6qz075azm5e"),
724
725 # (Q) <pafii>de<runba>
726 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
727 "de-jg4avhby1noc0d"),
728
729 # (R) <sono><supiido><de>
730 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
731 "d9juau41awczczp"),
732
733 # (S) -> $1.00 <-
734 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
735 u"\u003C\u002D",
736 "-> $1.00 <--")
737 ]
738
739for i in punycode_testcases:
740 if len(i)!=2:
741 print repr(i)
742
class PunycodeTest(unittest.TestCase):
    # Round-trips the (unicode, punycode) pairs in punycode_testcases
    # through the "punycode" codec.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
756
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000757class UnicodeInternalTest(unittest.TestCase):
758 def test_bug1251300(self):
759 # Decoding with unicode_internal used to not correctly handle "code
760 # points" above 0x10ffff on UCS-4 builds.
761 if sys.maxunicode > 0xffff:
762 ok = [
763 ("\x00\x10\xff\xff", u"\U0010ffff"),
764 ("\x00\x00\x01\x01", u"\U00000101"),
765 ("", u""),
766 ]
767 not_ok = [
768 "\x7f\xff\xff\xff",
769 "\x80\x00\x00\x00",
770 "\x81\x00\x00\x00",
771 "\x00",
772 "\x00\x00\x00\x00\x00",
773 ]
774 for internal, uni in ok:
775 if sys.byteorder == "little":
776 internal = "".join(reversed(internal))
777 self.assertEquals(uni, internal.decode("unicode_internal"))
778 for internal in not_ok:
779 if sys.byteorder == "little":
780 internal = "".join(reversed(internal))
781 self.assertRaises(UnicodeDecodeError, internal.decode,
782 "unicode_internal")
783
784 def test_decode_error_attributes(self):
785 if sys.maxunicode > 0xffff:
786 try:
787 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
788 except UnicodeDecodeError, ex:
789 self.assertEquals("unicode_internal", ex.encoding)
790 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
791 self.assertEquals(4, ex.start)
792 self.assertEquals(8, ex.end)
793 else:
794 self.fail()
795
796 def test_decode_callback(self):
797 if sys.maxunicode > 0xffff:
798 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
799 decoder = codecs.getdecoder("unicode_internal")
800 ab = u"ab".encode("unicode_internal")
801 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
802 "UnicodeInternalTest")
803 self.assertEquals((u"ab", 12), ignored)
804
Martin v. Löwis2548c732003-04-18 10:39:54 +0000805# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
806nameprep_tests = [
807 # 3.1 Map to nothing.
808 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
809 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
810 '\xb8\x8f\xef\xbb\xbf',
811 'foobarbaz'),
812 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
813 ('CAFE',
814 'cafe'),
815 # 3.3 Case folding 8bit U+00DF (german sharp s).
816 # The original test case is bogus; it says \xc3\xdf
817 ('\xc3\x9f',
818 'ss'),
819 # 3.4 Case folding U+0130 (turkish capital I with dot).
820 ('\xc4\xb0',
821 'i\xcc\x87'),
822 # 3.5 Case folding multibyte U+0143 U+037A.
823 ('\xc5\x83\xcd\xba',
824 '\xc5\x84 \xce\xb9'),
825 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
826 # XXX: skip this as it fails in UCS-2 mode
827 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
828 # 'telc\xe2\x88\x95kg\xcf\x83'),
829 (None, None),
830 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
831 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
832 '\xc7\xb0 a'),
833 # 3.8 Case folding U+1FB7 and normalization.
834 ('\xe1\xbe\xb7',
835 '\xe1\xbe\xb6\xce\xb9'),
836 # 3.9 Self-reverting case folding U+01F0 and normalization.
837 # The original test case is bogus, it says `\xc7\xf0'
838 ('\xc7\xb0',
839 '\xc7\xb0'),
840 # 3.10 Self-reverting case folding U+0390 and normalization.
841 ('\xce\x90',
842 '\xce\x90'),
843 # 3.11 Self-reverting case folding U+03B0 and normalization.
844 ('\xce\xb0',
845 '\xce\xb0'),
846 # 3.12 Self-reverting case folding U+1E96 and normalization.
847 ('\xe1\xba\x96',
848 '\xe1\xba\x96'),
849 # 3.13 Self-reverting case folding U+1F56 and normalization.
850 ('\xe1\xbd\x96',
851 '\xe1\xbd\x96'),
852 # 3.14 ASCII space character U+0020.
853 (' ',
854 ' '),
855 # 3.15 Non-ASCII 8bit space character U+00A0.
856 ('\xc2\xa0',
857 ' '),
858 # 3.16 Non-ASCII multibyte space character U+1680.
859 ('\xe1\x9a\x80',
860 None),
861 # 3.17 Non-ASCII multibyte space character U+2000.
862 ('\xe2\x80\x80',
863 ' '),
864 # 3.18 Zero Width Space U+200b.
865 ('\xe2\x80\x8b',
866 ''),
867 # 3.19 Non-ASCII multibyte space character U+3000.
868 ('\xe3\x80\x80',
869 ' '),
870 # 3.20 ASCII control characters U+0010 U+007F.
871 ('\x10\x7f',
872 '\x10\x7f'),
873 # 3.21 Non-ASCII 8bit control character U+0085.
874 ('\xc2\x85',
875 None),
876 # 3.22 Non-ASCII multibyte control character U+180E.
877 ('\xe1\xa0\x8e',
878 None),
879 # 3.23 Zero Width No-Break Space U+FEFF.
880 ('\xef\xbb\xbf',
881 ''),
882 # 3.24 Non-ASCII control character U+1D175.
883 ('\xf0\x9d\x85\xb5',
884 None),
885 # 3.25 Plane 0 private use character U+F123.
886 ('\xef\x84\xa3',
887 None),
888 # 3.26 Plane 15 private use character U+F1234.
889 ('\xf3\xb1\x88\xb4',
890 None),
891 # 3.27 Plane 16 private use character U+10F234.
892 ('\xf4\x8f\x88\xb4',
893 None),
894 # 3.28 Non-character code point U+8FFFE.
895 ('\xf2\x8f\xbf\xbe',
896 None),
897 # 3.29 Non-character code point U+10FFFF.
898 ('\xf4\x8f\xbf\xbf',
899 None),
900 # 3.30 Surrogate code U+DF42.
901 ('\xed\xbd\x82',
902 None),
903 # 3.31 Non-plain text character U+FFFD.
904 ('\xef\xbf\xbd',
905 None),
906 # 3.32 Ideographic description character U+2FF5.
907 ('\xe2\xbf\xb5',
908 None),
909 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +0000910 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +0000911 '\xcc\x81'),
912 # 3.34 Left-to-right mark U+200E.
913 ('\xe2\x80\x8e',
914 None),
915 # 3.35 Deprecated U+202A.
916 ('\xe2\x80\xaa',
917 None),
918 # 3.36 Language tagging character U+E0001.
919 ('\xf3\xa0\x80\x81',
920 None),
921 # 3.37 Language tagging character U+E0042.
922 ('\xf3\xa0\x81\x82',
923 None),
924 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
925 ('foo\xd6\xbebar',
926 None),
927 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
928 ('foo\xef\xb5\x90bar',
929 None),
930 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
931 ('foo\xef\xb9\xb6bar',
932 'foo \xd9\x8ebar'),
933 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
934 ('\xd8\xa71',
935 None),
936 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
937 ('\xd8\xa71\xd8\xa8',
938 '\xd8\xa71\xd8\xa8'),
939 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +0000940 # Skip this test as we allow unassigned
941 #('\xf3\xa0\x80\x82',
942 # None),
943 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000944 # 3.44 Larger test (shrinking).
945 # Original test case reads \xc3\xdf
946 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
947 '\xaa\xce\xb0\xe2\x80\x80',
948 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
949 # 3.45 Larger test (expanding).
950 # Original test case reads \xc3\x9f
951 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
952 '\x80',
953 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
954 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
955 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
956 ]
957
958
959class NameprepTest(unittest.TestCase):
960 def test_nameprep(self):
961 from encodings.idna import nameprep
962 for pos, (orig, prepped) in enumerate(nameprep_tests):
963 if orig is None:
964 # Skipped
965 continue
966 # The Unicode strings are given in UTF-8
967 orig = unicode(orig, "utf-8")
968 if prepped is None:
969 # Input contains prohibited characters
970 self.assertRaises(UnicodeError, nameprep, orig)
971 else:
972 prepped = unicode(prepped, "utf-8")
973 try:
974 self.assertEquals(nameprep(orig), prepped)
975 except Exception,e:
976 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
977
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        # ASCII and punycode ("xn--") names, each with and without a
        # trailing dot.
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        # Note: this vector deliberately keeps its original str (not
        # unicode) input.
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been read, a further read() must return
        # an empty unicode string.  (StringIO is imported at module level.)
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # iterdecode() is fed the byte string one character at a time.  The
        # vectors mirror test_builtin_decode; the original repeated the
        # trailing-dot IDN vector twice and never covered the IDN name
        # *without* a trailing dot.
        for encoded, decoded in [
            ("python.org", u"python.org"),
            ("python.org.", u"python.org."),
            ("xn--pythn-mua.org", u"pyth\xf6n.org"),
            ("xn--pythn-mua.org.", u"pyth\xf6n.org."),
        ]:
            self.assertEqual(
                "".join(codecs.iterdecode(encoded, "idna")),
                decoded
            )

        # A label is only emitted once its terminating dot has been seen;
        # the last label is held back until final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        # After reset() the same decoder can be reused; with a trailing dot
        # nothing is left over for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Same vectors as test_incremental_decode, in the other direction
        # (again replacing a duplicated trailing-dot vector by the missing
        # no-trailing-dot one).
        for decoded, encoded in [
            (u"python.org", "python.org"),
            (u"python.org.", "python.org."),
            (u"pyth\xf6n.org", "xn--pythn-mua.org"),
            (u"pyth\xf6n.org.", "xn--pythn-mua.org."),
        ]:
            self.assertEqual(
                "".join(codecs.iterencode(decoded, "idna")),
                encoded
            )

        # The encoder likewise holds back the last (unterminated) label
        # until final=True.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
1054
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of ``codecs``."""

    def check_codec_getter(self, getter):
        # Checks shared by lookup() and the get*() helpers below: calling
        # without a codec name is a TypeError, asking for the unregistered
        # name "__spam__" is a LookupError.
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # The encoding argument is optional.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # The encoding argument is optional.
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # u'\uffff' is U+FFFF.  The previous spelling u'\xffff' was parsed
        # as U+00FF followed by the two letters "ff" (\x takes exactly two
        # hex digits), which only worked by accident.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\uffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # A non-callable search function is rejected.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.check_codec_getter(codecs.lookup)
        # A name consisting only of a blank is not a known codec either.
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.check_codec_getter(codecs.getencoder)

    def test_getdecoder(self):
        self.check_codec_getter(codecs.getdecoder)

    def test_getreader(self):
        self.check_codec_getter(codecs.getreader)

    def test_getwriter(self):
        self.check_codec_getter(codecs.getwriter)
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001096
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 bytes of U+D55C, a newline and U+AE00 (see the expected
        # value in test_readlines).
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split after the newline and decode both pieces.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
1106
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller receives
        # UTF-16-LE.
        utf8_file = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_file, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file receives
        # Latin-1.
        latin1_file = StringIO.StringIO()
        recoder = codecs.EncodedFile(latin1_file, 'utf-8', 'latin1')
        recoder.write('\xc3\xbc')
        self.assertEqual(latin1_file.getvalue(), '\xfc')
1118
class Str2StrTest(unittest.TestCase):
    """Stream readers of str-to-str codecs must hand back str objects."""

    def make_reader(self):
        # A base64_codec StreamReader over the base64 form of the single
        # byte "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        decoded = self.make_reader().read()
        self.assertEqual(decoded, "\x80")
        self.assertTrue(isinstance(decoded, str))

    def test_readline(self):
        decoded = self.make_reader().readline()
        self.assertEqual(decoded, "\x80")
        self.assertTrue(isinstance(decoded, str))
1134
# Codec names exercised by BasicUnicodeTest.  The order of the original
# list (sorted by name) is preserved.
all_unicode_encodings = """
    ascii base64_codec big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp737 cp775
    cp850 cp852 cp855 cp856 cp857
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk
    hex_codec hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3
    iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape rot_13
    shift_jis shift_jis_2004 shift_jisx0213
    tis_620 unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encodings work only with str, not unicode
all_string_encodings = "quopri_codec string_escape uu_codec".split()

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = (
    "base64_codec hex_codec punycode unicode_internal".split()
)
# Independent copy, taken *before* the optional bz2/zlib codecs are appended
# to broken_unicode_with_streams below.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = "idna zlib_codec bz2_codec".split()

# The bz2 and zlib codecs are only registered for testing when the
# corresponding module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings += ["bz2_codec"]
    broken_unicode_with_streams += ["bz2_codec"]

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings += ["zlib_codec"]
    broken_unicode_with_streams += ["zlib_codec"]
1283
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks that are run over every codec in all_unicode_encodings."""

    def test_basics(self):
        # For each codec, round-trip the same short text through
        #   1. the stateless encoder/decoder functions,
        #   2. StreamWriter/StreamReader     (unless listed in
        #      broken_unicode_with_streams),
        #   3. the incremental coders, fetched both through the Python API
        #      and through _testcapi, plus iterencode()/iterdecode()
        #      (unless listed in broken_incremental_coders),
        #   4. the incremental coders created with errors="ignore"
        #      (unless listed in only_strict_mode).
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() is adjusted ("_codec" suffix
            # re-added, "latin_1" forced) and then compared with the list
            # entry, treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            # NOTE(review): the consumed-size check is skipped for
            # unicode_internal, presumably because its encoder does not
            # report a character count -- confirm.
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Queue (defined at the top of this file) is a FIFO byte
                # buffer: the writer is fed one character at a time and the
                # bytes produced so far are collected after every write.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                # Feed the encoded form back one byte at a time through a
                # fresh queue and reader.
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    # One character (or byte) per call, followed by a final
                    # call with an empty input and final=True.
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    # Same per-character round trip as above, but with the
                    # coders created with errors="ignore" and without a
                    # final=True call.
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            # Rewind and re-read the first line several times; every pass
            # must yield a prefix of s.
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Every decoder function must reject a call without arguments; all
        # except idna and punycode are also required to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder function must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # The assertion is trivially true on purpose: comparing the type
        # object with itself is the operation that must not crash.
        self.assertEqual(table_type, table_type)
1407
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs listed in all_string_encodings."""

    def test_basics(self):
        # Encode a short str with the stateless encoder, check that the
        # whole input was consumed, then decode it back.
        text = "abc123"
        for codec_name in all_string_encodings:
            encoded, consumed = codecs.getencoder(codec_name)(text)
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(codec_name)(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, codec_name))
1416
class CharmapTest(unittest.TestCase):
    """charmap_decode() with a unicode string as the decoding table."""

    def test_decode_with_string_map(self):
        # (error handler, decoding table, expected text) for the fixed
        # three-byte input below.  A byte that is beyond the end of the
        # table, or that maps to u"\ufffe", counts as undefined and is
        # treated according to the error handler.
        data = "\x00\x01\x02"
        cases = [
            ("strict", u"abc", u"abc"),
            ("replace", u"ab", u"ab\ufffd"),
            ("replace", u"ab\ufffe", u"ab\ufffd"),
            ("ignore", u"ab", u"ab"),
            ("ignore", u"ab\ufffe", u"ab"),
        ]
        for errors, table, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(data, errors, table),
                (expected, 3)
            )

        # With an empty table and "ignore" every possible byte is dropped,
        # but all of them are reported as consumed.
        allbytes = "".join(map(chr, xrange(256)))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
1449
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers must be usable in a ``with`` statement."""

    def test_encodedfile(self):
        # UTF-8 on disk, Latin-1 towards the caller.
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            self.assertEqual(recoder.read(), "\xfc")

    def test_streamreaderwriter(self):
        utf8 = codecs.lookup("utf-8")
        raw = StringIO.StringIO("\xc3\xbc")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), u"\xfc")
1462
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001463
def test_main():
    # Entry point used by the test runner: hands the TestCase classes
    # below, in this order, to test_support.run_unittest().
    test_classes = (
        # UTF-* codecs
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        # buffers, escapes and recoding
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        # punycode / internal / IDNA
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        # module-level API and stream helpers
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        # generic per-codec checks
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001493
1494
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()