from __future__ import with_statement
from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
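        # Regression check: iterating over a StreamReader fed many
        # "\r\n"-terminated lines must yield every line exactly as written.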
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
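        # Pump data through a Queue so that line endings can arrive split
        # across separate writes, and check readline() at every step.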
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
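        # Regression check: readline() must return long lines intact and
        # keep returning the following lines, then u"" at end of stream.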
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
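        # Same area, a second data pattern: every line must come back from
        # readline() unchanged, with u"" once the stream is exhausted.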
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\xff\xfe\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")

def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()