blob: 6d6e37897daa389847a0f9eb5c7589d9673b67f1 [file] [log] [blame]
Georg Brandl2a5a3022006-10-29 08:39:27 +00001from __future__ import with_statement
Barry Warsaw04f357c2002-07-23 19:04:11 +00002from test import test_support
3import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00004import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        # Everything written but not yet read, oldest data first.
        self._buffer = ""

    def write(self, chars):
        # Append chars to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to size leading characters; a negative
        # size drains the whole buffer.
        if size < 0:
            size = len(self._buffer)
        head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
26
class ReadTest(unittest.TestCase):
    """Reader tests shared by the per-encoding test classes.

    Every method reads the codec name from self.encoding, which this
    class does not define itself; subclasses provide it.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()

        def feed_decoder():
            result = u""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(c)
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode("", True), u"")
            self.assertEqual(d.buffer, "")

        feed_decoder()

        # Check whether the reset method works properly
        d.reset()
        feed_decoder()

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty result and
            # return them joined with "|" so line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # NOTE(review): split() without an argument splits on whitespace,
        # and "\n", "\r" and u"\u2028" all appear to count as whitespace
        # for unicode strings, so this list looks like it is always empty
        # and the two loops below never run their bodies. Left unchanged
        # because enabling them would add assertions for every subclass
        # -- confirm before replacing it with an explicit tuple.
        lineends = u"\n \r\n \r \u2028".split()

        # Test long lines (multiple calls to read() in readline())
        # NOTE(review): u"\3042" is the octal escape \304 followed by "2";
        # possibly u"\u3042" was intended -- confirm.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating over the reader must give back every line of s unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written between two
        # readline() calls becomes visible to the next call.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n"-terminated lines must come back one per readline()
        # call, followed by an empty result.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        for expected in (s1, s2, s3, u""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        for expected in (s1, s2, s3, s4, s5, u""):
            self.assertEqual(reader.readline(), expected)
247
class UTF16Test(ReadTest):
    """ReadTest checks for "utf-16" plus BOM-specific tests."""
    encoding = "utf-16"

    # u"spamspam" with one leading BOM, little-endian and big-endian
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that starts with "\xff\xff" must make read() raise.
        for data in ("\xff\xff", "\xff\xff\xff\xff"):
            s = StringIO.StringIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte passed with final=True must be rejected.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
297
class UTF16LETest(ReadTest):
    """ReadTest checks for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per input byte: a character shows up only
        # after its second byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte passed with final=True must be rejected.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
318
class UTF16BETest(ReadTest):
    """ReadTest checks for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per input byte: a character shows up only
        # after its second byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte passed with final=True must be rejected.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
339
class UTF8Test(ReadTest):
    """ReadTest checks for "utf-8"."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per input byte; the list repeats an entry
        # while a multi-byte character is still incomplete.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
360
class UTF7Test(ReadTest):
    """ReadTest checks for "utf-7"; only the inherited tests run."""
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
365
class UTF16ExTest(unittest.TestCase):
    """Error-path checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A single byte passed with final=True must be rejected.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
373
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode."""

    def test_array(self):
        # An array of characters is accepted and returned as a string
        # together with its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are TypeErrors.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
389
class CharBufferTest(unittest.TestCase):
    """Checks for codecs.charbuffer_encode."""

    def test_string(self):
        # The input comes back unchanged together with its length.
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are TypeErrors.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
401
class UTF8SigTest(ReadTest):
    """ReadTest checks for "utf-8-sig" plus BOM-specific tests."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The incremental decoder must return the text without the BOM
        # that encoding with "utf-8-sig" put in front of it.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
437
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode."""

    def test_empty(self):
        # Empty input decodes to an empty string with zero consumed.
        self.assertEqual(codecs.escape_decode(""), ("", 0))
441
class RecodingTest(unittest.TestCase):
    """Regression check for writing through codecs.EncodedFile."""

    def test_recoding(self):
        # No assertion here: the test only has to get through write()
        # and close() on the recoding wrapper.
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000450
# From RFC 3492
# Each entry is a (unicode text, punycode encoding) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: report any entry that is not a pair, which
# is what a missing comma between adjacent literals would produce.
for i in punycode_testcases:
    if len(i)!=2:
        # Parenthesised so the statement parses whether or not print is a
        # function; the printed text is the same either way.
        print(repr(i))
558
class PunycodeTest(unittest.TestCase):
    """Runs the "punycode" codec over the punycode_testcases pairs."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        # Decoding is compared without any case folding.
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
572
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the "unicode_internal" codec.

    Every test returns without checking anything unless sys.maxunicode
    is above 0xffff.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            return
        ok = [
            ("\x00\x10\xff\xff", u"\U0010ffff"),
            ("\x00\x00\x01\x01", u"\U00000101"),
            ("", u""),
        ]
        not_ok = [
            "\x7f\xff\xff\xff",
            "\x80\x00\x00\x00",
            "\x81\x00\x00\x00",
            "\x00",
            "\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are written most significant byte first,
        # so they are reversed on little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode <= 0xffff:
            return
        bad = "\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            bad.decode("unicode_internal")
        except UnicodeDecodeError:
            # sys.exc_info() avoids the version-specific syntax for binding
            # the exception in the except clause.
            ex = sys.exc_info()[1]
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(bad, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    def test_decode_callback(self):
        if sys.maxunicode <= 0xffff:
            return
        # Register the "ignore" handler under a private name and decode
        # u"ab" with four "\x22" bytes inserted between the two characters.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = u"ab".encode("unicode_internal")
        ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
            "UnicodeInternalTest")
        self.assertEqual((u"ab", 12), ignored)
620
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both as UTF-8 byte strings.
# An expected output of None marks input that must be rejected, and
# (None, None) marks a test vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
773
774
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception:
                    # Re-raise with the test vector number (pos is
                    # zero-based) so the failing entry can be found.
                    # sys.exc_info() avoids the version-specific syntax
                    # for binding the exception in the except clause.
                    e = sys.exc_info()[1]
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
793
class IDNACodecTest(unittest.TestCase):
    """Checks for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # After all three bytes have been read, a further read() must
        # return an empty unicode string.
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        # NOTE(review): this repeats the previous assertion verbatim;
        # possibly a different input was intended -- confirm.
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been
        # seen; the last label comes out when final=True is passed.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        # NOTE(review): this repeats the previous assertion verbatim;
        # possibly a different input was intended -- confirm.
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # Same label-by-label behaviour as the incremental decoder.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
870
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000871class CodecsModuleTest(unittest.TestCase):
872
873 def test_decode(self):
874 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
875 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000876 self.assertRaises(TypeError, codecs.decode)
877 self.assertEquals(codecs.decode('abc'), u'abc')
878 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
879
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000880 def test_encode(self):
881 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
882 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000883 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +0000884 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +0000885 self.assertEquals(codecs.encode(u'abc'), 'abc')
886 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
887
888 def test_register(self):
889 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +0000890 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +0000891
892 def test_lookup(self):
893 self.assertRaises(TypeError, codecs.lookup)
894 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +0000895 self.assertRaises(LookupError, codecs.lookup, " ")
896
897 def test_getencoder(self):
898 self.assertRaises(TypeError, codecs.getencoder)
899 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
900
901 def test_getdecoder(self):
902 self.assertRaises(TypeError, codecs.getdecoder)
903 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
904
905 def test_getreader(self):
906 self.assertRaises(TypeError, codecs.getreader)
907 self.assertRaises(LookupError, codecs.getreader, "__spam__")
908
909 def test_getwriter(self):
910 self.assertRaises(TypeError, codecs.getwriter)
911 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000912
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000913class StreamReaderTest(unittest.TestCase):
914
915 def setUp(self):
916 self.reader = codecs.getreader('utf-8')
917 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
918
919 def test_readlines(self):
920 f = self.reader(self.stream)
921 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
922
Georg Brandl2a5a3022006-10-29 08:39:27 +0000923class EncodedFileTest(unittest.TestCase):
Neal Norwitz44dab0a2007-04-25 06:42:41 +0000924
Georg Brandl2a5a3022006-10-29 08:39:27 +0000925 def test_basic(self):
926 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandlb8205a12006-10-29 09:32:19 +0000927 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Georg Brandlf96b1622006-10-29 15:22:43 +0000928 self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl2a5a3022006-10-29 08:39:27 +0000929
930 f = StringIO.StringIO()
931 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
932 ef.write('\xc3\xbc')
933 self.assertEquals(f.getvalue(), '\xfc')
934
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000935class Str2StrTest(unittest.TestCase):
936
937 def test_read(self):
938 sin = "\x80".encode("base64_codec")
939 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
940 sout = reader.read()
941 self.assertEqual(sout, "\x80")
942 self.assert_(isinstance(sout, str))
943
944 def test_readline(self):
945 sin = "\x80".encode("base64_codec")
946 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
947 sout = reader.readline()
948 self.assertEqual(sout, "\x80")
949 self.assert_(isinstance(sout, str))
950
# Encodings exercised with unicode input.  The order is significant only in
# that the tests iterate over the list in this order.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is tested only when this build of the codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal",
]
# Snapshot taken *before* the optional compression codecs are appended
# below, so those codecs are excluded from the stream checks only.
broken_incremental_coders = list(broken_unicode_with_streams)

# bz2_codec is tested only when the bz2 module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

# zlib_codec is tested only when the zlib module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1092
1093class BasicUnicodeTest(unittest.TestCase):
1094 def test_basics(self):
1095 s = u"abc123" # all codecs should be able to encode these
1096 for encoding in all_unicode_encodings:
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001097 name = codecs.lookup(encoding).name
1098 if encoding.endswith("_codec"):
1099 name += "_codec"
1100 elif encoding == "latin_1":
1101 name = "latin_1"
1102 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001103 (bytes, size) = codecs.getencoder(encoding)(s)
1104 if encoding != "unicode_internal":
1105 self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
1106 (chars, size) = codecs.getdecoder(encoding)(bytes)
1107 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1108
1109 if encoding not in broken_unicode_with_streams:
1110 # check stream reader/writer
1111 q = Queue()
1112 writer = codecs.getwriter(encoding)(q)
1113 encodedresult = ""
1114 for c in s:
1115 writer.write(c)
1116 encodedresult += q.read()
1117 q = Queue()
1118 reader = codecs.getreader(encoding)(q)
1119 decodedresult = u""
1120 for c in encodedresult:
1121 q.write(c)
1122 decodedresult += reader.read()
1123 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1124
Georg Brandlc68d2cc2006-10-29 14:39:13 +00001125 if encoding not in broken_incremental_coders:
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001126 # check incremental decoder/encoder (fetched via the Python
1127 # and C API) and iterencode()/iterdecode()
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001128 try:
1129 encoder = codecs.getincrementalencoder(encoding)()
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001130 cencoder = _testcapi.codec_incrementalencoder(encoding)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001131 except LookupError: # no IncrementalEncoder
1132 pass
1133 else:
1134 # check incremental decoder/encoder
1135 encodedresult = ""
1136 for c in s:
1137 encodedresult += encoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001138 encodedresult += encoder.encode(u"", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001139 decoder = codecs.getincrementaldecoder(encoding)()
1140 decodedresult = u""
1141 for c in encodedresult:
1142 decodedresult += decoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001143 decodedresult += decoder.decode("", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001144 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1145
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001146 # check C API
1147 encodedresult = ""
1148 for c in s:
1149 encodedresult += cencoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001150 encodedresult += cencoder.encode(u"", True)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001151 cdecoder = _testcapi.codec_incrementaldecoder(encoding)
1152 decodedresult = u""
1153 for c in encodedresult:
1154 decodedresult += cdecoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001155 decodedresult += cdecoder.decode("", True)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001156 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1157
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001158 # check iterencode()/iterdecode()
1159 result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
1160 self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))
1161
1162 # check iterencode()/iterdecode() with empty string
1163 result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
1164 self.assertEqual(result, u"")
1165
Walter Dörwald729c31f2005-03-14 19:06:30 +00001166 def test_seek(self):
1167 # all codecs should be able to encode these
1168 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1169 for encoding in all_unicode_encodings:
1170 if encoding == "idna": # FIXME: See SF bug #1163178
1171 continue
1172 if encoding in broken_unicode_with_streams:
1173 continue
1174 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1175 for t in xrange(5):
1176 # Test that calling seek resets the internal codec state and buffers
1177 reader.seek(0, 0)
1178 line = reader.readline()
1179 self.assertEqual(s[:len(line)], line)
1180
Walter Dörwalde22d3392005-11-17 08:52:34 +00001181 def test_bad_decode_args(self):
1182 for encoding in all_unicode_encodings:
1183 decoder = codecs.getdecoder(encoding)
1184 self.assertRaises(TypeError, decoder)
1185 if encoding not in ("idna", "punycode"):
1186 self.assertRaises(TypeError, decoder, 42)
1187
1188 def test_bad_encode_args(self):
1189 for encoding in all_unicode_encodings:
1190 encoder = codecs.getencoder(encoding)
1191 self.assertRaises(TypeError, encoder)
1192
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001193 def test_encoding_map_type_initialized(self):
1194 from encodings import cp1140
1195 # This used to crash, we are only verifying there's no crash.
1196 table_type = type(cp1140.encoding_table)
1197 self.assertEqual(table_type, table_type)
1198
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001199class BasicStrTest(unittest.TestCase):
1200 def test_basics(self):
1201 s = "abc123"
1202 for encoding in all_string_encodings:
1203 (bytes, size) = codecs.getencoder(encoding)(s)
1204 self.assertEqual(size, len(s))
1205 (chars, size) = codecs.getdecoder(encoding)(bytes)
1206 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1207
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001208class CharmapTest(unittest.TestCase):
1209 def test_decode_with_string_map(self):
1210 self.assertEquals(
1211 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1212 (u"abc", 3)
1213 )
1214
1215 self.assertEquals(
1216 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1217 (u"ab\ufffd", 3)
1218 )
1219
1220 self.assertEquals(
1221 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1222 (u"ab\ufffd", 3)
1223 )
1224
1225 self.assertEquals(
1226 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1227 (u"ab", 3)
1228 )
1229
1230 self.assertEquals(
1231 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1232 (u"ab", 3)
1233 )
1234
1235 allbytes = "".join(chr(i) for i in xrange(256))
1236 self.assertEquals(
1237 codecs.charmap_decode(allbytes, "ignore", u""),
1238 (u"", len(allbytes))
1239 )
1240
Georg Brandl2a5a3022006-10-29 08:39:27 +00001241class WithStmtTest(unittest.TestCase):
1242 def test_encodedfile(self):
1243 f = StringIO.StringIO("\xc3\xbc")
1244 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1245 self.assertEquals(ef.read(), "\xfc")
1246
1247 def test_streamreaderwriter(self):
1248 f = StringIO.StringIO("\xc3\xbc")
1249 info = codecs.lookup("utf-8")
1250 with codecs.StreamReaderWriter(f, info.streamreader,
1251 info.streamwriter, 'strict') as srw:
1252 self.assertEquals(srw.read(), u"\xfc")
1253
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001254
def test_main():
    # Hand every test class of this module to test_support.run_unittest(),
    # in the order listed here.
    test_classes = (
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001281
1282
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()