from __future__ import with_statement
from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

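# A minimal usage sketch of the Queue helper above (illustrative only, not
# executed as part of the tests):
#
#     q = Queue()
#     q.write("spam")
#     q.read(2)   # -> "sp"
#     q.read()    # -> "am", leaving the buffer empty
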
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

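    # For reference, a hand-run sketch of the byte-by-byte feeding that
    # check_partial() automates, here with the utf-8 codec (illustrative
    # only; the euro sign U+20AC is "\xe2\x82\xac" in UTF-8):
    #
    #     d = codecs.getincrementaldecoder("utf-8")()
    #     d.decode("\xe2")     # -> u""  (incomplete sequence is buffered)
    #     d.decode("\x82")     # -> u""
    #     d.decode("\xac")     # -> u"\u20ac"
    #     d.decode("", True)   # -> u""  (nothing left in the buffer)
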
    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

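    # The expected values above follow from how utf-8-sig treats the BOM:
    # exactly one leading BOM is stripped when decoding and one is written
    # when encoding (illustrative sketch only, not executed here):
    #
    #     u"spam".encode("utf-8-sig")                     # -> '\xef\xbb\xbfspam'
    #     '\xef\xbb\xbfspam'.decode("utf-8-sig")          # -> u'spam'
    #     '\xef\xbb\xbf\xef\xbb\xbf'.decode("utf-8-sig")  # -> u'\ufeff'
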
    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

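# A quick sketch of what these vectors exercise, using the first (Arabic)
# entry (illustrative only, not executed as part of the tests):
#
#     uni, puny = punycode_testcases[0]
#     uni.encode("punycode")           # -> "egbpdaj6bu4bxfgehfvwxn"
#     puny.decode("punycode") == uni   # -> True
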
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

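# For orientation, a sketch of the str-to-str behaviour exercised above
# (illustrative only, not executed as part of the tests):
#
#     "spam".encode("base64_codec")        # -> 'c3BhbQ==\n'
#     'c3BhbQ==\n'.decode("base64_codec")  # -> 'spam'
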
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash; we are only verifying that there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()