from __future__ import with_statement
from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

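# A minimal usage sketch (illustrative only): the tests below pair this Queue
# with codecs.getwriter()/codecs.getreader() so that bytes written at one end
# are decoded incrementally at the other, e.g.
#   q = Queue()
#   writer = codecs.getwriter("utf-8")(q)
#   reader = codecs.getreader("utf-8")(q)
#   writer.write(u"abc"); assert reader.read() == u"abc"
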
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

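    # Illustrative note on check_partial() above: for UTF-8 the encoded form
    # of u"\xff" is the two bytes "\xc3\xbf"; fed one byte at a time, the
    # decoder yields u"" after the first byte and u"\xff" after the second.
    # The partialresults lists supplied by the subclasses below follow that
    # shape, one entry per input byte.
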
    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16', 'utf-8')
        self.assertEquals(ef.read(), '\xff\xfe\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#     "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = ""
                for c in s:
                    encodedresult += encoder.encode(c)
                encodedresult += encoder.encode(u"", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += decoder.decode(c)
                decodedresult += decoder.decode("", True)
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check C API
                encodedresult = ""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode(u"", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(c)
                decodedresult += cdecoder.decode("", True)
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            # check iterencode()/iterdecode()
            result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
            self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

            # check iterencode()/iterdecode() with empty string
            result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
            self.assertEqual(result, u"")

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash; we are only verifying that there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()