from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

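# A minimal usage sketch for Queue (illustration only, not executed by the
# tests): bytes come back out in write order, split however read() is asked
# to split them.
#
#     q = Queue()
#     q.write("spam")
#     q.read(2)   # -> "sp"
#     q.read()    # -> "am"
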
class MixInCheckStateHandling:
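    """
    Check that an incremental codec's state survives a getstate()/setstate()
    round trip at every possible split point of the input (a rough
    illustration of the state tuple follows this class).
    """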
    def check_state_handling_decode(self, encoding, u, s):
        for i in xrange(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assert_(isinstance(state[1], int))
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the buffered input back in must not produce any output
                self.assert_(not d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        for i in xrange(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

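# A rough illustration of the decoder state handled by the mixin above, using
# the utf-16 codec (already exercised by this module) purely as an example:
#
#     d = codecs.getincrementaldecoder("utf-16")()
#     d.decode("\xff")            # half a BOM: no output can be produced yet
#     state = d.getstate()        # -> (buffered input bytes, integer flags)
#     d2 = codecs.getincrementaldecoder("utf-16")()
#     d2.setstate(state)          # d2 continues exactly where d stopped
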
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset() method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '    \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          "\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         u"spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         u"spamspam", self.spambe)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

class UTF7Test(ReadTest):
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_decoder_state(self):
        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

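# One concrete round trip drawn from the table above (entry O), shown purely
# as an illustration of what the tests below verify:
#
#     u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032".encode("punycode")
#     -> "2-u9tlzr9756bt3uc0v"
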
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
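        # (On a UCS-4 build, "unicode_internal" is simply the in-memory
        # layout: four native-endian bytes per code point. The test data
        # below is written big-endian and is byte-reversed for
        # little-endian machines.)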
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


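# For orientation, one of the simplest vectors above (3.2) only exercises
# ASCII case folding (a sketch, not executed here):
#
#     from encodings.idna import nameprep
#     nameprep(u"CAFE")   # -> u"cafe"
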
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception as e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
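        # EncodedFile(file, data_encoding, file_encoding) transcodes: bytes
        # read from the underlying file are decoded with file_encoding and
        # re-encoded with data_encoding (the reverse happens on write), which
        # is what the expected byte strings below spell out.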
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = u"abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()