blob: 5df6fe5e4e147146f652fbea33c7849c0d847672 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO: data is appended with write() at one end and
    consumed with read() from the other end.
    """

    def __init__(self):
        # Everything that has been written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append *chars* to the tail of the queue."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the head of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size >= 0:
            head, self._buffer = self._buffer[:size], self._buffer[size:]
            return head
        drained, self._buffer = self._buffer, ""
        return drained
25
class ReadTest(unittest.TestCase):
    """Decoding tests shared by the codecs that can be read incrementally.

    This class is only a base: every test reads ``self.encoding``, which
    the concrete subclasses define.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one encoded byte at a time and check every step.

        *partialresults* lists the text that must be available after each
        successive byte, so it needs exactly one entry per byte of
        ``input.encode(self.encoding)``.  The same check is run through a
        StreamReader, an incremental decoder, that decoder again after
        ``reset()``, and finally ``codecs.iterdecode()``.
        """
        encoded = input.encode(self.encoding)
        # zip() stops at the shorter argument, so without this check a
        # partialresults list of the wrong length would leave trailing
        # bytes (or trailing expectations) silently untested.
        self.assertEqual(len(encoded), len(partialresults))

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined by "|"
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: building them with
        # u"\n \r\n \r \u2028".split() yields an empty list (every line
        # end is itself whitespace), which made the loops below no-ops.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty; an empty line
            # read with keepends=False would end readalllines() early.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected value must too
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-32" codec."""

    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, in either byte order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Neither one nor two 4-byte units of 0xff bytes form a valid BOM
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
311
class UTF32LETest(ReadTest):
    """ReadTest checks for the "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing is decoded for the first three bytes; afterwards each
        # prefix of the text stays current for four bytes, and the last
        # byte completes the whole text.
        expected = 3 * [u""]
        for count in (1, 2, 3):
            expected += 4 * [text[:count]]
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        decode = codecs.utf_32_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
344
class UTF32BETest(ReadTest):
    """ReadTest checks for the "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing is decoded for the first three bytes; afterwards each
        # prefix of the text stays current for four bytes, and the last
        # byte completes the whole text.
        expected = 3 * [u""]
        for count in (1, 2, 3):
            expected += 4 * [text[:count]]
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        decode = codecs.utf_32_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
377
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-16" codec."""

    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, in either byte order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Neither one nor two 2-byte units of 0xff bytes form a valid BOM
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
427
class UTF16LETest(ReadTest):
    """ReadTest checks for the "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing is decoded for the first byte; afterwards each prefix
        # of the text stays current for two bytes, and the last byte
        # completes the whole text.
        expected = [u""]
        for count in (1, 2, 3):
            expected += 2 * [text[:count]]
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        decode = codecs.utf_16_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
448
class UTF16BETest(ReadTest):
    """ReadTest checks for the "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing is decoded for the first byte; afterwards each prefix
        # of the text stays current for two bytes, and the last byte
        # completes the whole text.
        expected = [u""]
        for count in (1, 2, 3):
            expected += 2 * [text[:count]]
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        decode = codecs.utf_16_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
469
class UTF8Test(ReadTest):
    """ReadTest checks for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        text = u"\x00\xff\u07ff\u0800\uffff"
        # repeats[k] is the number of consecutive bytes after which
        # exactly the first k+1 characters of the text are decoded.
        repeats = (2, 2, 3, 3, 1)
        expected = []
        for count, times in enumerate(repeats):
            expected += times * [text[:count + 1]]
        self.check_partial(text, expected)
490
class UTF7Test(ReadTest):
    """ReadTest checks for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        text = u"a+-b"
        # One expected prefix of the text per encoded byte.
        expected = [text[:1], text[:1], text[:2], text[:3], text]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000505
class UTF16ExTest(unittest.TestCase):
    """Error and argument checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A single 0xff byte must be rejected with these arguments.
        args = ("\xff", "strict", 0, True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, *args)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        decode = codecs.utf_16_ex_decode
        self.assertRaises(TypeError, decode)
513
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode."""

    def test_array(self):
        import array
        # A character array is accepted as input.
        source = array.array("c", "spam")
        self.assertEqual(codecs.readbuffer_encode(source), ("spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, ("", 0))

    def test_bad_args(self):
        encode = codecs.readbuffer_encode
        # Missing argument, then an argument of the wrong type.
        self.assertRaises(TypeError, encode)
        self.assertRaises(TypeError, encode, 42)
529
class CharBufferTest(unittest.TestCase):
    """Checks for codecs.charbuffer_encode."""

    def test_string(self):
        encoded = codecs.charbuffer_encode("spam")
        self.assertEqual(encoded, ("spam", 4))

    def test_empty(self):
        encoded = codecs.charbuffer_encode("")
        self.assertEqual(encoded, ("", 0))

    def test_bad_args(self):
        encode = codecs.charbuffer_encode
        # Missing argument, then an argument of the wrong type.
        self.assertRaises(TypeError, encode)
        self.assertRaises(TypeError, encode, 42)
541
class UTF8SigTest(ReadTest):
    """ReadTest checks plus signature (BOM) handling for "utf-8-sig"."""

    encoding = "utf-8-sig"

    def test_partial(self):
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff"
        # The encoded stream starts with two BOMs: the first one is read
        # and skipped, and the second one is only emitted once it is
        # complete, so nothing is decoded for the first five bytes.
        expected = 5 * [u""]
        # repeats[k] is the number of consecutive bytes after which
        # exactly the first k+1 characters of the text are decoded.
        repeats = (1, 2, 2, 3, 3, 1)
        for count, times in enumerate(repeats):
            expected += times * [text[:count + 1]]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader with a range
        # of read sizes (None means "read everything") and check that the
        # concatenated chunks equal unistring.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = make_reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature
        self._check_stream(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # Input without a signature
        self._check_stream(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )
621
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode."""

    def test_empty(self):
        # An empty string decodes to an empty string, consuming 0 bytes
        self.assertEqual(codecs.escape_decode(""), ("", 0))
625
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000634
# From RFC 3492
# Each entry is a pair (unicode text, expected punycode encoding).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: report any entry that is not a pair (for
# example because a comma is missing between two adjacent literals).
# The parenthesised print works as a statement and as a function call.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
742
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over the punycode_testcases pairs."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
756
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the "unicode_internal" codec.

    The decoding tests only run their checks when sys.maxunicode is
    above 0xffff.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            # The byte strings above are written most significant byte
            # first, so they are reversed on little-endian machines.
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                # decoding must not succeed
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            # four invalid bytes between the encoded "a" and "b"
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
810
Martin v. Löwis2548c732003-04-18 10:39:54 +0000811# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
812nameprep_tests = [
813 # 3.1 Map to nothing.
814 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
815 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
816 '\xb8\x8f\xef\xbb\xbf',
817 'foobarbaz'),
818 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
819 ('CAFE',
820 'cafe'),
821 # 3.3 Case folding 8bit U+00DF (german sharp s).
822 # The original test case is bogus; it says \xc3\xdf
823 ('\xc3\x9f',
824 'ss'),
825 # 3.4 Case folding U+0130 (turkish capital I with dot).
826 ('\xc4\xb0',
827 'i\xcc\x87'),
828 # 3.5 Case folding multibyte U+0143 U+037A.
829 ('\xc5\x83\xcd\xba',
830 '\xc5\x84 \xce\xb9'),
831 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
832 # XXX: skip this as it fails in UCS-2 mode
833 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
834 # 'telc\xe2\x88\x95kg\xcf\x83'),
835 (None, None),
836 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
837 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
838 '\xc7\xb0 a'),
839 # 3.8 Case folding U+1FB7 and normalization.
840 ('\xe1\xbe\xb7',
841 '\xe1\xbe\xb6\xce\xb9'),
842 # 3.9 Self-reverting case folding U+01F0 and normalization.
843 # The original test case is bogus, it says `\xc7\xf0'
844 ('\xc7\xb0',
845 '\xc7\xb0'),
846 # 3.10 Self-reverting case folding U+0390 and normalization.
847 ('\xce\x90',
848 '\xce\x90'),
849 # 3.11 Self-reverting case folding U+03B0 and normalization.
850 ('\xce\xb0',
851 '\xce\xb0'),
852 # 3.12 Self-reverting case folding U+1E96 and normalization.
853 ('\xe1\xba\x96',
854 '\xe1\xba\x96'),
855 # 3.13 Self-reverting case folding U+1F56 and normalization.
856 ('\xe1\xbd\x96',
857 '\xe1\xbd\x96'),
858 # 3.14 ASCII space character U+0020.
859 (' ',
860 ' '),
861 # 3.15 Non-ASCII 8bit space character U+00A0.
862 ('\xc2\xa0',
863 ' '),
864 # 3.16 Non-ASCII multibyte space character U+1680.
865 ('\xe1\x9a\x80',
866 None),
867 # 3.17 Non-ASCII multibyte space character U+2000.
868 ('\xe2\x80\x80',
869 ' '),
870 # 3.18 Zero Width Space U+200b.
871 ('\xe2\x80\x8b',
872 ''),
873 # 3.19 Non-ASCII multibyte space character U+3000.
874 ('\xe3\x80\x80',
875 ' '),
876 # 3.20 ASCII control characters U+0010 U+007F.
877 ('\x10\x7f',
878 '\x10\x7f'),
879 # 3.21 Non-ASCII 8bit control character U+0085.
880 ('\xc2\x85',
881 None),
882 # 3.22 Non-ASCII multibyte control character U+180E.
883 ('\xe1\xa0\x8e',
884 None),
885 # 3.23 Zero Width No-Break Space U+FEFF.
886 ('\xef\xbb\xbf',
887 ''),
888 # 3.24 Non-ASCII control character U+1D175.
889 ('\xf0\x9d\x85\xb5',
890 None),
891 # 3.25 Plane 0 private use character U+F123.
892 ('\xef\x84\xa3',
893 None),
894 # 3.26 Plane 15 private use character U+F1234.
895 ('\xf3\xb1\x88\xb4',
896 None),
897 # 3.27 Plane 16 private use character U+10F234.
898 ('\xf4\x8f\x88\xb4',
899 None),
900 # 3.28 Non-character code point U+8FFFE.
901 ('\xf2\x8f\xbf\xbe',
902 None),
903 # 3.29 Non-character code point U+10FFFF.
904 ('\xf4\x8f\xbf\xbf',
905 None),
906 # 3.30 Surrogate code U+DF42.
907 ('\xed\xbd\x82',
908 None),
909 # 3.31 Non-plain text character U+FFFD.
910 ('\xef\xbf\xbd',
911 None),
912 # 3.32 Ideographic description character U+2FF5.
913 ('\xe2\xbf\xb5',
914 None),
915 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +0000916 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +0000917 '\xcc\x81'),
918 # 3.34 Left-to-right mark U+200E.
919 ('\xe2\x80\x8e',
920 None),
921 # 3.35 Deprecated U+202A.
922 ('\xe2\x80\xaa',
923 None),
924 # 3.36 Language tagging character U+E0001.
925 ('\xf3\xa0\x80\x81',
926 None),
927 # 3.37 Language tagging character U+E0042.
928 ('\xf3\xa0\x81\x82',
929 None),
930 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
931 ('foo\xd6\xbebar',
932 None),
933 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
934 ('foo\xef\xb5\x90bar',
935 None),
936 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
937 ('foo\xef\xb9\xb6bar',
938 'foo \xd9\x8ebar'),
939 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
940 ('\xd8\xa71',
941 None),
942 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
943 ('\xd8\xa71\xd8\xa8',
944 '\xd8\xa71\xd8\xa8'),
945 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +0000946 # Skip this test as we allow unassigned
947 #('\xf3\xa0\x80\x82',
948 # None),
949 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000950 # 3.44 Larger test (shrinking).
951 # Original test case reads \xc3\xdf
952 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
953 '\xaa\xce\xb0\xe2\x80\x80',
954 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
955 # 3.45 Larger test (expanding).
956 # Original test case reads \xc3\x9f
957 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
958 '\x80',
959 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
960 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
961 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
962 ]
963
964
class NameprepTest(unittest.TestCase):
    """Run the nameprep test vectors against ``encodings.idna.nameprep``."""

    def test_nameprep(self):
        """Check every ``(orig, prepped)`` pair of ``nameprep_tests``.

        Both members of a pair are UTF-8 encoded byte strings.  A pair
        whose ``orig`` is None is a skipped vector; a ``prepped`` of None
        means nameprep() has to reject the input with UnicodeError.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the one-based position of the vector so
                    # the failing entry of the table can be identified.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
983
Walter Dörwald78a0be62006-04-14 18:25:39 +0000984class IDNACodecTest(unittest.TestCase):
985 def test_builtin_decode(self):
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000986 self.assertEquals(unicode("python.org", "idna"), u"python.org")
Walter Dörwald78a0be62006-04-14 18:25:39 +0000987 self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
988 self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
989 self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
990
991 def test_builtin_encode(self):
992 self.assertEquals(u"python.org".encode("idna"), "python.org")
993 self.assertEquals("python.org.".encode("idna"), "python.org.")
994 self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
995 self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000996
Martin v. Löwis8b595142005-08-25 11:03:38 +0000997 def test_stream(self):
998 import StringIO
999 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1000 r.read(3)
1001 self.assertEquals(r.read(), u"")
1002
Walter Dörwald78a0be62006-04-14 18:25:39 +00001003 def test_incremental_decode(self):
1004 self.assertEquals(
1005 "".join(codecs.iterdecode("python.org", "idna")),
1006 u"python.org"
1007 )
1008 self.assertEquals(
1009 "".join(codecs.iterdecode("python.org.", "idna")),
1010 u"python.org."
1011 )
1012 self.assertEquals(
1013 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1014 u"pyth\xf6n.org."
1015 )
1016 self.assertEquals(
1017 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1018 u"pyth\xf6n.org."
1019 )
1020
1021 decoder = codecs.getincrementaldecoder("idna")()
1022 self.assertEquals(decoder.decode("xn--xam", ), u"")
1023 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1024 self.assertEquals(decoder.decode(u"rg"), u"")
1025 self.assertEquals(decoder.decode(u"", True), u"org")
1026
1027 decoder.reset()
1028 self.assertEquals(decoder.decode("xn--xam", ), u"")
1029 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1030 self.assertEquals(decoder.decode("rg."), u"org.")
1031 self.assertEquals(decoder.decode("", True), u"")
1032
1033 def test_incremental_encode(self):
1034 self.assertEquals(
1035 "".join(codecs.iterencode(u"python.org", "idna")),
1036 "python.org"
1037 )
1038 self.assertEquals(
1039 "".join(codecs.iterencode(u"python.org.", "idna")),
1040 "python.org."
1041 )
1042 self.assertEquals(
1043 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1044 "xn--pythn-mua.org."
1045 )
1046 self.assertEquals(
1047 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1048 "xn--pythn-mua.org."
1049 )
1050
1051 encoder = codecs.getincrementalencoder("idna")()
1052 self.assertEquals(encoder.encode(u"\xe4x"), "")
1053 self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1054 self.assertEquals(encoder.encode(u"", True), "org")
1055
1056 encoder.reset()
1057 self.assertEquals(encoder.encode(u"\xe4x"), "")
1058 self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1059 self.assertEquals(encoder.encode(u"", True), "")
1060
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001061class CodecsModuleTest(unittest.TestCase):
1062
1063 def test_decode(self):
1064 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1065 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001066 self.assertRaises(TypeError, codecs.decode)
1067 self.assertEquals(codecs.decode('abc'), u'abc')
1068 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1069
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001070 def test_encode(self):
1071 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1072 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001073 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001074 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +00001075 self.assertEquals(codecs.encode(u'abc'), 'abc')
1076 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1077
1078 def test_register(self):
1079 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001080 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001081
1082 def test_lookup(self):
1083 self.assertRaises(TypeError, codecs.lookup)
1084 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001085 self.assertRaises(LookupError, codecs.lookup, " ")
1086
1087 def test_getencoder(self):
1088 self.assertRaises(TypeError, codecs.getencoder)
1089 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1090
1091 def test_getdecoder(self):
1092 self.assertRaises(TypeError, codecs.getdecoder)
1093 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1094
1095 def test_getreader(self):
1096 self.assertRaises(TypeError, codecs.getreader)
1097 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1098
1099 def test_getwriter(self):
1100 self.assertRaises(TypeError, codecs.getwriter)
1101 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001102
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001103class StreamReaderTest(unittest.TestCase):
1104
1105 def setUp(self):
1106 self.reader = codecs.getreader('utf-8')
1107 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1108
1109 def test_readlines(self):
1110 f = self.reader(self.stream)
1111 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
1112
Georg Brandl8f99f812006-10-29 08:39:22 +00001113class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001114
Georg Brandl8f99f812006-10-29 08:39:22 +00001115 def test_basic(self):
1116 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001117 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
1118 self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001119
1120 f = StringIO.StringIO()
1121 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1122 ef.write('\xc3\xbc')
1123 self.assertEquals(f.getvalue(), '\xfc')
1124
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001125class Str2StrTest(unittest.TestCase):
1126
1127 def test_read(self):
1128 sin = "\x80".encode("base64_codec")
1129 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1130 sout = reader.read()
1131 self.assertEqual(sout, "\x80")
1132 self.assert_(isinstance(sout, str))
1133
1134 def test_readline(self):
1135 sin = "\x80".encode("base64_codec")
1136 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1137 sout = reader.readline()
1138 self.assertEqual(sout, "\x80")
1139 self.assert_(isinstance(sout, str))
1140
# Encodings that accept unicode input; entries are grouped by family, one
# family per line, in the same order as before.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken before the optional compression codecs are appended
# below, so those stay out of the incremental-coder exclusion list.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are only tested when their module is importable.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1289
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every entry of all_unicode_encodings."""

    @staticmethod
    def _mismatch(got, expected, encoding):
        # Failure message shared by all the round-trip assertions.
        return "%r != %r (encoding=%r)" % (got, expected, encoding)

    @staticmethod
    def _incremental_roundtrip(encoder, make_decoder, text, flush):
        # Feed text to the encoder one character at a time, then feed the
        # encoded result to a decoder one byte at a time and return what
        # comes out.  make_decoder is only called once encoding is done.
        # With flush set, both coders get an extra empty final call.
        pieces = [encoder.encode(char) for char in text]
        if flush:
            pieces.append(encoder.encode(u"", True))
        encoded = "".join(pieces)
        decoder = make_decoder()
        pieces = [decoder.decode(byte) for byte in encoded]
        if flush:
            pieces.append(decoder.decode("", True))
        return u"".join(pieces)

    def _check_streams(self, text, encoding):
        # check stream reader/writer
        queue = Queue()
        writer = codecs.getwriter(encoding)(queue)
        encoded = ""
        for char in text:
            writer.write(char)
            encoded += queue.read()
        queue = Queue()
        reader = codecs.getreader(encoding)(queue)
        decoded = u""
        for byte in encoded:
            queue.write(byte)
            decoded += reader.read()
        self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

    def test_basics(self):
        text = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by the registry has to match the list entry
            # once "_" and "-" are treated alike.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), canonical.replace("_", "-"))

            # Stateless round trip.
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text), self._mismatch(consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                self._check_streams(text, encoding)

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                decoded = self._incremental_roundtrip(
                    encoder,
                    lambda: codecs.getincrementaldecoder(encoding)(),
                    text, True)
                self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

                # check C API
                decoded = self._incremental_roundtrip(
                    cencoder,
                    lambda: _testcapi.codec_incrementaldecoder(encoding),
                    text, True)
                self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                result = u"".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, text, self._mismatch(result, text, encoding))

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode(u"", encoding)
                result = u"".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, u"")

            if encoding in only_strict_mode:
                continue

            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError: # no IncrementalEncoder
                pass
            else:
                decoded = self._incremental_roundtrip(
                    encoder,
                    lambda: codecs.getincrementaldecoder(encoding)("ignore"),
                    text, False)
                self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

                decoded = self._incremental_roundtrip(
                    cencoder,
                    lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
                    text, False)
                self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            source = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(source)
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                first_line = reader.readline()
                self.assertEqual(text[:len(first_line)], first_line)

    def test_bad_decode_args(self):
        # "idna" and "punycode" are exempt from the wrong-type check.
        lenient = ("idna", "punycode")
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            if encoding in lenient:
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)
1412
class BasicStrTest(unittest.TestCase):
    """Stateless round trip for the codecs in all_string_encodings."""

    def test_basics(self):
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            # The encoder has to report that it consumed the whole input.
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            message = "%r != %r (encoding=%r)" % (decoded, text, encoding)
            self.assertEqual(decoded, text, message)
1421
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001422class CharmapTest(unittest.TestCase):
1423 def test_decode_with_string_map(self):
1424 self.assertEquals(
1425 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1426 (u"abc", 3)
1427 )
1428
1429 self.assertEquals(
1430 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1431 (u"ab\ufffd", 3)
1432 )
1433
1434 self.assertEquals(
1435 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1436 (u"ab\ufffd", 3)
1437 )
1438
1439 self.assertEquals(
1440 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1441 (u"ab", 3)
1442 )
1443
1444 self.assertEquals(
1445 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1446 (u"ab", 3)
1447 )
1448
1449 allbytes = "".join(chr(i) for i in xrange(256))
1450 self.assertEquals(
1451 codecs.charmap_decode(allbytes, "ignore", u""),
1452 (u"", len(allbytes))
1453 )
1454
Georg Brandl8f99f812006-10-29 08:39:22 +00001455class WithStmtTest(unittest.TestCase):
1456 def test_encodedfile(self):
1457 f = StringIO.StringIO("\xc3\xbc")
1458 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1459 self.assertEquals(ef.read(), "\xfc")
1460
1461 def test_streamreaderwriter(self):
1462 f = StringIO.StringIO("\xc3\xbc")
1463 info = codecs.lookup("utf-8")
1464 with codecs.StreamReaderWriter(f, info.streamreader,
1465 info.streamwriter, 'strict') as srw:
1466 self.assertEquals(srw.read(), u"\xfc")
1467
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001468
def test_main():
    """Run every TestCase class of this module via test_support."""
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test,
        UTF16ExTest,
        ReadBufferTest, CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest, IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest, EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest, BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()