from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )
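        # For reference, a minimal sketch of the byte-at-a-time behaviour
        # exercised above (assuming the "utf-8" codec; illustration only):
        #
        #   d = codecs.getincrementaldecoder("utf-8")()
        #   d.decode("\xe2\x82")   # u"" -- incomplete sequence is buffered
        #   d.decode("\xac")       # u"\u20ac" once the sequence completes
        #   d.decode("", True)     # final call: flushes and raises if bytes
        #                          # are still pending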

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
            ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

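    # Expected output of writing u"spam" twice through a UTF-16 StreamWriter:
    # exactly one BOM ('\xff\xfe' little-endian, '\xfe\xff' big-endian)
    # followed by the encoded characters.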
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

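# A quick illustration of the codec exercised by PunycodeTest below, using
# RFC case (B) from the table above (for orientation only; the tests compare
# case-insensitively):
#
#   u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587".encode("punycode")
#   should give "ihqwcrb4cv8a8dqg056pqjye", and decoding that byte string
#   with .decode("punycode") should give the original text back.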
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


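# nameprep() below is the stringprep profile used by IDNA: e.g. test 3.2
# above folds 'CAFE' to 'cafe', while entries whose expected output is None
# contain prohibited characters and must make nameprep() raise UnicodeError.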
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
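        # ('\xed\x95\x9c' and '\xea\xb8\x80' are the UTF-8 encodings of
        # u'\ud55c' and u'\uae00', which the readlines() test below expects.)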

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
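        # (EncodedFile re-encodes what it reads: the UTF-8 bytes above decode
        # to u'\ud55c\n\uae00', and its UTF-16-LE encoding is the expected
        # '\x5c\xd5\x0a\x00\x00\xae'.)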

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
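# (roughly speaking these transforms need to see more than one byte or
# character at a time -- base64/hex work on groups of bytes, punycode on
# whole strings -- so the byte-by-byte stream checks below skip them)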
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()