blob: 185670bb19e817a5cd7ad5c858c95f10eb8ab90e [file] [log] [blame]
Georg Brandl2a5a3022006-10-29 08:39:27 +00001from __future__ import with_statement
Barry Warsaw04f357c2002-07-23 19:04:11 +00002from test import test_support
3import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00004import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    In-memory FIFO pipe: whatever is passed to write() is handed back,
    oldest data first, by read().
    """
    def __init__(self):
        # Data that has been written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append chars to the tail of the queue."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """
        Remove and return up to size items from the head of the queue.
        A negative size drains everything that is currently queued.
        """
        if size < 0:
            head, self._buffer = self._buffer, ""
        else:
            head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
26
class ReadTest(unittest.TestCase):
    """
    Decoding tests shared by the per-codec test cases.

    Every method reads the codec name from self.encoding, so concrete
    subclasses are expected to define an ``encoding`` class attribute.
    """

    def check_partial(self, input, partialresults):
        """
        Feed the encoded form of input to the decoder one byte at a time
        and check the text decoded so far after every byte.

        input is the unicode string to encode with self.encoding;
        partialresults[n] is the text expected once n+1 bytes have been
        fed.  The same check is run against a StreamReader, an
        incremental decoder, the same decoder after reset(), and
        finally codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # StreamReader.readline() must split on \n, \r\n, \r and \u2028,
        # with and without keepends and with an explicit size hint.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined by "|"
            # so that the line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: every one of them is
        # whitespace, so building this list with
        # u"\n \r\n \r \u2028".split() yields an empty list and the
        # loops below would silently never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; with keepends=False an
            # empty line is indistinguishable from EOF in readalllines().
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected value must too.
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the filler line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded, in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is still being written to: a
        # trailing \r is returned as a line end, and the \n that arrives
        # afterwards is seen as a separate (empty) line.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three \r\n-terminated lines must come back unchanged, followed
        # by u"" at EOF.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
247
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" encoded with a single leading BOM, in little-endian
    # and big-endian byte order respectively.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through one StreamWriter must emit the BOM only once.
        make_writer = codecs.getwriter(self.encoding)
        make_reader = codecs.getreader(self.encoding)
        sink = StringIO.StringIO()
        out = make_writer(sink)
        out.write(u"spam")
        out.write(u"spam")
        encoded = sink.getvalue()
        self.assert_(encoded in (self.spamle, self.spambe))
        # ... and the bytes must decode back to the text that was written.
        source = StringIO.StringIO(encoded)
        self.assertEquals(make_reader(source).read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        for data in ("\xff\xff", "\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(StringIO.StringIO(data))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        expected = [
            u"", # first byte of BOM read
            u"", # second byte of BOM read => byteorder known
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(u"\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        # A lone byte with final=True is a truncated code unit.
        self.assertRaises(
            UnicodeDecodeError,
            codecs.utf_16_decode, "\xff", "strict", True
        )
297
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM here: a character appears after every second byte.
        expected = [
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(u"\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        # A lone byte with final=True is a truncated code unit.
        self.assertRaises(
            UnicodeDecodeError,
            codecs.utf_16_le_decode, "\xff", "strict", True
        )
318
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM here: a character appears after every second byte.
        expected = [
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(u"\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        # A lone byte with final=True is a truncated code unit.
        self.assertRaises(
            UnicodeDecodeError,
            codecs.utf_16_be_decode, "\xff", "strict", True
        )
339
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: the test characters take
        # 1, 2, 2, 3 and 3 bytes respectively.
        expected = [
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(u"\x00\xff\u07ff\u0800\uffff", expected)
360
class UTF7Test(ReadTest):
    # Only the tests inherited from ReadTest are run for this codec.
    # No test_partial() yet, because UTF-7 doesn't support it.
    encoding = "utf-7"
365
366class UTF16ExTest(unittest.TestCase):
367
368 def test_errors(self):
369 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
370
371 def test_bad_args(self):
372 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
373
374class ReadBufferTest(unittest.TestCase):
375
376 def test_array(self):
377 import array
378 self.assertEqual(
379 codecs.readbuffer_encode(array.array("c", "spam")),
380 ("spam", 4)
381 )
382
383 def test_empty(self):
384 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
385
386 def test_bad_args(self):
387 self.assertRaises(TypeError, codecs.readbuffer_encode)
388 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
389
390class CharBufferTest(unittest.TestCase):
391
392 def test_string(self):
393 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
394
395 def test_empty(self):
396 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
397
398 def test_bad_args(self):
399 self.assertRaises(TypeError, codecs.charbuffer_encode)
400 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
401
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The text itself starts with U+FEFF, so the encoded form holds
        # the signature BOM followed by a second, "real" BOM.
        expected = [
            u"",
            u"",
            u"", # First BOM has been read and skipped
            u"",
            u"",
            u"\ufeff", # Second BOM has been read and emitted
            u"\ufeff\x00", # "\x00" read and emitted
            u"\ufeff\x00", # First byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(u"\ufeff\x00\xff\u07ff\u0800\uffff", expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        # (the call only has to succeed; its result is not inspected)
        unicode("\xef\xbb\xbf", "utf-8-sig")
432
Walter Dörwald8709a422002-09-03 13:53:40 +0000433class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000434 def test_empty(self):
Walter Dörwald8709a422002-09-03 13:53:40 +0000435 self.assertEquals(codecs.escape_decode(""), ("", 0))
436
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000437class RecodingTest(unittest.TestCase):
438 def test_recoding(self):
439 f = StringIO.StringIO()
440 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
441 f2.write(u"a")
442 f2.close()
443 # Python used to crash on this at exit because of a refcount
444 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000445
# From RFC 3492
# Each entry is a pair (unicode text, expected punycode encoding).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--"),
]
549
550for i in punycode_testcases:
551 if len(i)!=2:
552 print repr(i)
553
554class PunycodeTest(unittest.TestCase):
555 def test_encode(self):
556 for uni, puny in punycode_testcases:
557 # Need to convert both strings to lower case, since
558 # some of the extended encodings use upper case, but our
559 # code produces only lower case. Converting just puny to
560 # lower is also insufficient, since some of the input characters
561 # are upper case.
562 self.assertEquals(uni.encode("punycode").lower(), puny.lower())
563
564 def test_decode(self):
565 for uni, puny in punycode_testcases:
566 self.assertEquals(uni, puny.decode("punycode"))
567
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000568class UnicodeInternalTest(unittest.TestCase):
569 def test_bug1251300(self):
570 # Decoding with unicode_internal used to not correctly handle "code
571 # points" above 0x10ffff on UCS-4 builds.
572 if sys.maxunicode > 0xffff:
573 ok = [
574 ("\x00\x10\xff\xff", u"\U0010ffff"),
575 ("\x00\x00\x01\x01", u"\U00000101"),
576 ("", u""),
577 ]
578 not_ok = [
579 "\x7f\xff\xff\xff",
580 "\x80\x00\x00\x00",
581 "\x81\x00\x00\x00",
582 "\x00",
583 "\x00\x00\x00\x00\x00",
584 ]
585 for internal, uni in ok:
586 if sys.byteorder == "little":
587 internal = "".join(reversed(internal))
588 self.assertEquals(uni, internal.decode("unicode_internal"))
589 for internal in not_ok:
590 if sys.byteorder == "little":
591 internal = "".join(reversed(internal))
592 self.assertRaises(UnicodeDecodeError, internal.decode,
593 "unicode_internal")
594
595 def test_decode_error_attributes(self):
596 if sys.maxunicode > 0xffff:
597 try:
598 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
599 except UnicodeDecodeError, ex:
600 self.assertEquals("unicode_internal", ex.encoding)
601 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
602 self.assertEquals(4, ex.start)
603 self.assertEquals(8, ex.end)
604 else:
605 self.fail()
606
607 def test_decode_callback(self):
608 if sys.maxunicode > 0xffff:
609 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
610 decoder = codecs.getdecoder("unicode_internal")
611 ab = u"ab".encode("unicode_internal")
612 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
613 "UnicodeInternalTest")
614 self.assertEquals((u"ab", 12), ignored)
615
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both UTF-8 encoded.  An expected
# output of None means the input must be rejected; an input of None marks
# a skipped test (kept so that list positions match the 3.x numbering).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE', 'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f', 'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0', 'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba', '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa', '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7', '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0', '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90', '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0', '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96', '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96', '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ', ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0', ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80', ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b', ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80', ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f', '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf', ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81', '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # (title as given in the draft; the bytes below encode U+FE76)
    ('foo\xef\xb9\xb6bar', 'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8', '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
768
769
770class NameprepTest(unittest.TestCase):
771 def test_nameprep(self):
772 from encodings.idna import nameprep
773 for pos, (orig, prepped) in enumerate(nameprep_tests):
774 if orig is None:
775 # Skipped
776 continue
777 # The Unicode strings are given in UTF-8
778 orig = unicode(orig, "utf-8")
779 if prepped is None:
780 # Input contains prohibited characters
781 self.assertRaises(UnicodeError, nameprep, orig)
782 else:
783 prepped = unicode(prepped, "utf-8")
784 try:
785 self.assertEquals(nameprep(orig), prepped)
786 except Exception,e:
787 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
788
Walter Dörwald78a0be62006-04-14 18:25:39 +0000789class IDNACodecTest(unittest.TestCase):
790 def test_builtin_decode(self):
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000791 self.assertEquals(unicode("python.org", "idna"), u"python.org")
Walter Dörwald78a0be62006-04-14 18:25:39 +0000792 self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
793 self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
794 self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
795
796 def test_builtin_encode(self):
797 self.assertEquals(u"python.org".encode("idna"), "python.org")
798 self.assertEquals("python.org.".encode("idna"), "python.org.")
799 self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
800 self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000801
Martin v. Löwis8b595142005-08-25 11:03:38 +0000802 def test_stream(self):
803 import StringIO
804 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
805 r.read(3)
806 self.assertEquals(r.read(), u"")
807
Walter Dörwald78a0be62006-04-14 18:25:39 +0000808 def test_incremental_decode(self):
809 self.assertEquals(
810 "".join(codecs.iterdecode("python.org", "idna")),
811 u"python.org"
812 )
813 self.assertEquals(
814 "".join(codecs.iterdecode("python.org.", "idna")),
815 u"python.org."
816 )
817 self.assertEquals(
818 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
819 u"pyth\xf6n.org."
820 )
821 self.assertEquals(
822 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
823 u"pyth\xf6n.org."
824 )
825
826 decoder = codecs.getincrementaldecoder("idna")()
827 self.assertEquals(decoder.decode("xn--xam", ), u"")
828 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
829 self.assertEquals(decoder.decode(u"rg"), u"")
830 self.assertEquals(decoder.decode(u"", True), u"org")
831
832 decoder.reset()
833 self.assertEquals(decoder.decode("xn--xam", ), u"")
834 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
835 self.assertEquals(decoder.decode("rg."), u"org.")
836 self.assertEquals(decoder.decode("", True), u"")
837
838 def test_incremental_encode(self):
839 self.assertEquals(
840 "".join(codecs.iterencode(u"python.org", "idna")),
841 "python.org"
842 )
843 self.assertEquals(
844 "".join(codecs.iterencode(u"python.org.", "idna")),
845 "python.org."
846 )
847 self.assertEquals(
848 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
849 "xn--pythn-mua.org."
850 )
851 self.assertEquals(
852 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
853 "xn--pythn-mua.org."
854 )
855
856 encoder = codecs.getincrementalencoder("idna")()
857 self.assertEquals(encoder.encode(u"\xe4x"), "")
858 self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
859 self.assertEquals(encoder.encode(u"", True), "org")
860
861 encoder.reset()
862 self.assertEquals(encoder.encode(u"\xe4x"), "")
863 self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
864 self.assertEquals(encoder.encode(u"", True), "")
865
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000866class CodecsModuleTest(unittest.TestCase):
867
868 def test_decode(self):
869 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
870 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000871 self.assertRaises(TypeError, codecs.decode)
872 self.assertEquals(codecs.decode('abc'), u'abc')
873 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
874
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000875 def test_encode(self):
876 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
877 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000878 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +0000879 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +0000880 self.assertEquals(codecs.encode(u'abc'), 'abc')
881 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
882
883 def test_register(self):
884 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +0000885 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +0000886
887 def test_lookup(self):
888 self.assertRaises(TypeError, codecs.lookup)
889 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +0000890 self.assertRaises(LookupError, codecs.lookup, " ")
891
892 def test_getencoder(self):
893 self.assertRaises(TypeError, codecs.getencoder)
894 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
895
896 def test_getdecoder(self):
897 self.assertRaises(TypeError, codecs.getdecoder)
898 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
899
900 def test_getreader(self):
901 self.assertRaises(TypeError, codecs.getreader)
902 self.assertRaises(LookupError, codecs.getreader, "__spam__")
903
904 def test_getwriter(self):
905 self.assertRaises(TypeError, codecs.getwriter)
906 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000907
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000908class StreamReaderTest(unittest.TestCase):
909
910 def setUp(self):
911 self.reader = codecs.getreader('utf-8')
912 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
913
914 def test_readlines(self):
915 f = self.reader(self.stream)
916 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
917
Georg Brandl2a5a3022006-10-29 08:39:27 +0000918class EncodedFileTest(unittest.TestCase):
919
920 def test_basic(self):
921 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandlb8205a12006-10-29 09:32:19 +0000922 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Georg Brandlf96b1622006-10-29 15:22:43 +0000923 self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl2a5a3022006-10-29 08:39:27 +0000924
925 f = StringIO.StringIO()
926 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
927 ef.write('\xc3\xbc')
928 self.assertEquals(f.getvalue(), '\xfc')
929
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000930class Str2StrTest(unittest.TestCase):
931
932 def test_read(self):
933 sin = "\x80".encode("base64_codec")
934 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
935 sout = reader.read()
936 self.assertEqual(sout, "\x80")
937 self.assert_(isinstance(sout, str))
938
939 def test_readline(self):
940 sin = "\x80".encode("base64_codec")
941 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
942 sout = reader.readline()
943 self.assertEqual(sout, "\x80")
944 self.assert_(isinstance(sout, str))
945
# Codecs exercised by the generic unicode tests, in alphabetical order.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken before the optional codecs below are appended, so those
# are only listed as broken for streams, not for incremental coders.
broken_incremental_coders = list(broken_unicode_with_streams)

# The compression codecs are only tested when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings += ["bz2_codec"]
    broken_unicode_with_streams += ["bz2_codec"]

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings += ["zlib_codec"]
    broken_unicode_with_streams += ["zlib_codec"]
1087
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every codec in all_unicode_encodings."""

    def _incremental_roundtrip(self, text, encoder, make_decoder):
        # Encode text one character at a time and flush the encoder, then
        # feed the encoded string one item at a time to a decoder that is
        # only created once encoding has finished.  Returns the decoded text.
        encoded = ""
        for char in text:
            encoded += encoder.encode(char)
        encoded += encoder.encode(u"", True)
        decoder = make_decoder()
        decoded = u""
        for byte in encoded:
            decoded += decoder.decode(byte)
        decoded += decoder.decode("", True)
        return decoded

    def test_basics(self):
        text = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the name used here,
            # ignoring "_" vs "-"; the "_codec" suffix and latin_1 are
            # patched up before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encode/decode round trip
            encoded, consumed = codecs.getencoder(encoding)(text)
            if encoding != "unicode_internal":
                self.assertEqual(consumed, len(text), "%r != %r (encoding=%r)" % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, "%r != %r (encoding=%r)" % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                sink = Queue()
                writer = codecs.getwriter(encoding)(sink)
                streamed = ""
                for char in text:
                    writer.write(char)
                    streamed += sink.read()
                source = Queue()
                reader = codecs.getreader(encoding)(source)
                restored = u""
                for byte in streamed:
                    source.write(byte)
                    restored += reader.read()
                self.assertEqual(restored, text, "%r != %r (encoding=%r)" % (restored, text, encoding))

            # Nothing follows the incremental checks, so codecs that cannot
            # take part in them simply move on to the next encoding.
            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                continue

            # check incremental decoder/encoder
            restored = self._incremental_roundtrip(
                text, encoder,
                lambda: codecs.getincrementaldecoder(encoding)())
            self.assertEqual(restored, text, "%r != %r (encoding=%r)" % (restored, text, encoding))

            # check C API
            restored = self._incremental_roundtrip(
                text, cencoder,
                lambda: _testcapi.codec_incrementaldecoder(encoding))
            self.assertEqual(restored, text, "%r != %r (encoding=%r)" % (restored, text, encoding))

            # check iterencode()/iterdecode()
            chunks = codecs.iterencode(text, encoding)
            result = u"".join(codecs.iterdecode(chunks, encoding))
            self.assertEqual(result, text, "%r != %r (encoding=%r)" % (result, text, encoding))

            # check iterencode()/iterdecode() with empty string
            chunks = codecs.iterencode(u"", encoding)
            result = u"".join(codecs.iterdecode(chunks, encoding))
            self.assertEqual(result, u"")

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: idna is skipped as well, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for attempt in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                first_line = reader.readline()
                self.assertEqual(text[:len(first_line)], first_line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            # idna and punycode are exempt from the non-string argument check
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encode)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)
1193
class BasicStrTest(unittest.TestCase):
    """Stateless round trip through every codec in all_string_encodings."""

    def test_basics(self):
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            # the encoder must report the whole input as consumed
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, "%r != %r (encoding=%r)" % (decoded, text, encoding))
1202
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001203class CharmapTest(unittest.TestCase):
1204 def test_decode_with_string_map(self):
1205 self.assertEquals(
1206 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1207 (u"abc", 3)
1208 )
1209
1210 self.assertEquals(
1211 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1212 (u"ab\ufffd", 3)
1213 )
1214
1215 self.assertEquals(
1216 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1217 (u"ab\ufffd", 3)
1218 )
1219
1220 self.assertEquals(
1221 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1222 (u"ab", 3)
1223 )
1224
1225 self.assertEquals(
1226 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1227 (u"ab", 3)
1228 )
1229
1230 allbytes = "".join(chr(i) for i in xrange(256))
1231 self.assertEquals(
1232 codecs.charmap_decode(allbytes, "ignore", u""),
1233 (u"", len(allbytes))
1234 )
1235
Georg Brandl2a5a3022006-10-29 08:39:27 +00001236class WithStmtTest(unittest.TestCase):
1237 def test_encodedfile(self):
1238 f = StringIO.StringIO("\xc3\xbc")
1239 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1240 self.assertEquals(ef.read(), "\xfc")
1241
1242 def test_streamreaderwriter(self):
1243 f = StringIO.StringIO("\xc3\xbc")
1244 info = codecs.lookup("utf-8")
1245 with codecs.StreamReaderWriter(f, info.streamreader,
1246 info.streamwriter, 'strict') as srw:
1247 self.assertEquals(srw.read(), u"\xfc")
1248
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001249
def test_main():
    """Run the codec test case classes listed below via test_support."""
    test_classes = (
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test_main()