from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

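# A minimal sketch (not part of the original test suite) of how the Queue
# helper above gets wired up in the tests below: a codec StreamWriter feeds
# bytes in at one end and a StreamReader pulls decoded text out at the other,
# which is what makes byte-by-byte feeding observable.
#
#     q = Queue()
#     writer = codecs.getwriter("utf-8")(q)
#     reader = codecs.getreader("utf-8")(q)
#     writer.write(u"spam")
#     assert reader.read() == u"spam"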
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

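# An illustrative sketch, not one of the RFC 3492 vectors above: the codec is
# reachable through the ordinary str/unicode interface, e.g. the well-known
# IDNA example "b\xfccher" <-> "bcher-kva".
#
#     assert u"b\xfccher".encode("punycode") == "bcher-kva"
#     assert "bcher-kva".decode("punycode") == u"b\xfccher"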
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"
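# For illustration only (not exercised by this suite): the "undefined" codec
# is expected to raise UnicodeError on any use, e.g.
#
#     "spam".encode("undefined")   # raises UnicodeError
#     "spam".decode("undefined")   # raises UnicodeError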

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()