from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

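# A short illustrative sketch (not part of the original test suite) of how the
# Queue above behaves when driven directly; the stream and incremental tests
# below push encoded bytes through it one character at a time.
def _queue_sketch():
    q = Queue()
    q.write("ab")
    q.write("c")
    assert q.read(1) == "a"    # a sized read consumes from the front
    assert q.read() == "bc"    # an unsized read drains the buffer
    assert q.read() == ""      # further reads return an empty string
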
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults. (A standalone sketch of this feeding
        # pattern appears right after this class.)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
            ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

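# A minimal standalone sketch (not part of the original tests) of the
# byte-by-byte feeding that ReadTest.check_partial() above performs: an
# incremental decoder buffers incomplete byte sequences and only emits
# characters once enough bytes have arrived.
def _incremental_decode_sketch(encoding="utf-16"):
    decoder = codecs.getincrementaldecoder(encoding)()
    decoded = u""
    for byte in u"spam".encode(encoding):
        decoded += decoder.decode(byte)    # may return u"" on partial input
    decoded += decoder.decode("", True)    # flush with final=True
    assert decoded == u"spam"
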
class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

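# Illustrative sketch (not part of the original tests): the utf-8-sig codec
# writes one BOM on encode and strips exactly one leading BOM on decode,
# which is the behaviour UTF8SigTest.test_partial() above checks byte by byte.
def _utf8_sig_sketch():
    assert u"spam".encode("utf-8-sig") == "\xef\xbb\xbfspam"
    assert "\xef\xbb\xbfspam".decode("utf-8-sig") == u"spam"
    # a second BOM is ordinary data and survives the decode
    assert ("\xef\xbb\xbf" * 2 + "spam").decode("utf-8-sig") == u"\ufeffspam"
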
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

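# Illustrative sketch (not part of the original tests): codecs.EncodedFile()
# wraps a byte stream and transcodes between a data encoding and a file
# encoding, which is what RecodingTest above and EncodedFileTest below use.
def _encodedfile_sketch():
    f = StringIO.StringIO("\xc3\xbc")              # UTF-8 bytes for u"\xfc"
    ef = codecs.EncodedFile(f, "latin-1", "utf-8")
    assert ef.read() == "\xfc"                     # re-encoded as Latin-1
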
# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

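# Illustrative sketch (not part of the original tests): the punycode codec
# round-trips the RFC 3492 samples above; note that encode() produces only
# lower case, which is why PunycodeTest lower-cases both sides before comparing.
def _punycode_sketch():
    uni, puny = punycode_testcases[0]       # the Arabic (Egyptian) sample
    assert uni.encode("punycode").lower() == puny.lower()
    assert puny.decode("punycode") == uni
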
class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

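# Illustrative sketch (not part of the original tests): nameprep() maps and
# normalizes a label and raises UnicodeError for prohibited code points,
# which is what NameprepTest drives with the RFC test vectors above.
def _nameprep_sketch():
    from encodings.idna import nameprep
    assert nameprep(u"CAFE") == u"cafe"   # case folding, vector 3.2 above
    try:
        nameprep(u"\u1680")               # OGHAM SPACE MARK, prohibited (3.16)
    except UnicodeError:
        pass
    else:
        raise AssertionError("prohibited character was accepted")
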
class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

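# Illustrative sketch (not part of the original tests): "str-to-str" codecs
# such as base64_codec take byte strings and return byte strings, which is
# the property Str2StrTest above asserts for its readers.
def _str_to_str_codec_sketch():
    encoded = "spam".encode("base64_codec")
    assert encoded == "c3BhbQ==\n"
    decoded = encoded.decode("base64_codec")
    assert decoded == "spam" and isinstance(decoded, str)
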
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

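# Illustrative sketch (not part of the original tests) of the iterencode()/
# iterdecode() round trip that BasicUnicodeTest.test_basics() above performs
# for every encoding in all_unicode_encodings.
def _iter_roundtrip_sketch(encoding="utf-8"):
    s = u"abc123"
    chunks = codecs.iterencode(s, encoding)          # yields byte chunks
    assert u"".join(codecs.iterdecode(chunks, encoding)) == s
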
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()