blob: 9f734536ecadf38943f11bb34fbfb01d806813f6 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    Minimal FIFO buffer: data appended with write() is handed back,
    oldest first, by read().
    """
    def __init__(self, buffer):
        # The initial value determines the buffer type (e.g. b"" or "").
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return the first size items, or everything buffered
        when size is negative."""
        if size >= 0:
            head, tail = self._buffer[:size], self._buffer[size:]
        else:
            # hand out the whole buffer; keep an empty one of the same type
            head, tail = self._buffer, self._buffer[:0]
        self._buffer = tail
        return head
25
class MixInCheckStateHandling:
    """
    Mixin for TestCase classes: checks that the state of an incremental
    codec can be saved with getstate() and restored with setstate().
    """

    def check_state_handling_decode(self, encoding, u, s):
        """
        Cut the bytes s at every possible offset, decode the first part,
        move the decoder state into a brand-new decoder and decode the
        rest with it; both results together must equal u.
        """
        new_decoder = codecs.getincrementaldecoder(encoding)
        for cut in range(len(s)+1):
            decoder = new_decoder()
            head = decoder.decode(s[:cut])
            state = decoder.getstate()
            self.assertTrue(isinstance(state[1], int))
            # Verify the guarantee documented for
            # IncrementalDecoder.getstate()
            if not state[1]:
                # put the decoder into the default state, nothing buffered
                decoder.setstate((state[0][:0], 0))
                # replaying the buffered input must not emit any output ...
                self.assertFalse(decoder.decode(state[0]))
                # ... and must lead back to exactly the same state
                self.assertEqual(state, decoder.getstate())
            # Continue with a fresh decoder primed with the saved state
            successor = new_decoder()
            successor.setstate(state)
            tail = successor.decode(s[cut:], True)
            self.assertEqual(u, head+tail)

    def check_state_handling_encode(self, encoding, u, s):
        """
        Cut the text u at every possible offset, encode the first part,
        move the encoder state into a brand-new encoder and encode the
        rest with it; both results together must equal s.
        """
        new_encoder = codecs.getincrementalencoder(encoding)
        for cut in range(len(u)+1):
            encoder = new_encoder()
            head = encoder.encode(u[:cut])
            state = encoder.getstate()
            successor = new_encoder()
            successor.setstate(state)
            tail = successor.encode(u[cut:], True)
            self.assertEqual(s, head+tail)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """
    Decoding tests shared by the codecs that can be read incrementally.

    Concrete subclasses must define the class attribute ``encoding``; the
    tests encode their sample text with that codec and read it back through
    the codec's StreamReader and/or incremental decoder.
    """

    def check_partial(self, input, partialresults):
        """
        Feed the encoded form of input to the decoders one byte at a time.

        partialresults lists, for each byte fed, the complete text that
        must have been decoded up to and including that byte.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using an incremental decoder; the
        # second pass checks whether the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for second_pass in (False, True):
            if second_pass:
                d.reset()
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined with "|"
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple on purpose: they are all
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # every line is non-empty: readalllines() stops at the first
            # empty result, which an empty line read without line ends is
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must return exactly the lines that
        # were written, one after the other.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec, which writes and detects a BOM."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in both possible byte orders
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # a single byte is truncated input when final=True
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
350
class UTF32LETest(ReadTest):
    """Tests for the little-endian UTF-32 codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # Four bytes per character: a character only shows up once its
        # fourth byte has been fed, and then stays while the first three
        # bytes of the next character arrive.
        expected = ["", "", ""]
        for decoded in ("\x00", "\x00\xff", "\x00\xff\u0100"):
            expected.extend(4*[decoded])
        expected.append("\x00\xff\u0100\uffff")
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # a single byte is truncated input when final=True
        decode = codecs.utf_32_le_decode
        self.assertRaises(UnicodeDecodeError, decode, b"\xff", "strict", True)
383
class UTF32BETest(ReadTest):
    """Tests for the big-endian UTF-32 codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # Four bytes per character: a character only shows up once its
        # fourth byte has been fed, and then stays while the first three
        # bytes of the next character arrive.
        expected = ["", "", ""]
        for decoded in ("\x00", "\x00\xff", "\x00\xff\u0100"):
            expected.extend(4*[decoded])
        expected.append("\x00\xff\u0100\uffff")
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # a single byte is truncated input when final=True
        decode = codecs.utf_32_be_decode
        self.assertRaises(UnicodeDecodeError, decode, b"\xff", "strict", True)
416
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec, which writes and detects a BOM."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in both possible byte orders
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # a single byte is truncated input when final=True
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000473
class UTF16LETest(ReadTest):
    """Tests for the little-endian UTF-16 codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # Two bytes per character: each character shows up once its second
        # byte has been fed.
        one = "\x00"
        two = "\x00\xff"
        three = "\x00\xff\u0100"
        four = "\x00\xff\u0100\uffff"
        expected = ["", one, one, two, two, three, three, four]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        # a single byte is truncated input when final=True
        decode = codecs.utf_16_le_decode
        self.assertRaises(UnicodeDecodeError, decode, b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000495
class UTF16BETest(ReadTest):
    """Tests for the big-endian UTF-16 codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        # Two bytes per character: each character shows up once its second
        # byte has been fed.
        one = "\x00"
        two = "\x00\xff"
        three = "\x00\xff\u0100"
        four = "\x00\xff\u0100\uffff"
        expected = ["", one, one, two, two, three, three, four]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        # a single byte is truncated input when final=True
        decode = codecs.utf_16_be_decode
        self.assertRaises(UnicodeDecodeError, decode, b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000517
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # The sample characters take 1, 2, 2, 3 and 3 bytes respectively.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # lone surrogates are rejected in both directions by default
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")

    def test_surrogatepass_handler(self):
        # the "surrogatepass" error handler lets lone surrogates through
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000554
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # expected decoder output after each byte of the encoded text
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial("a+-b", expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000569
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        # a single byte is truncated input when final=True
        decode = codecs.utf_16_ex_decode
        self.assertRaises(UnicodeDecodeError, decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # calling without any argument is a TypeError
        decode = codecs.utf_16_ex_decode
        self.assertRaises(TypeError, decode)
577
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        encode = codecs.readbuffer_encode
        # no argument at all
        self.assertRaises(TypeError, encode)
        # an int does not provide a buffer
        self.assertRaises(TypeError, encode, 42)
593
594class CharBufferTest(unittest.TestCase):
595
596 def test_string(self):
Guido van Rossum09549f42007-08-27 20:40:10 +0000597 self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))
Walter Dörwalde22d3392005-11-17 08:52:34 +0000598
599 def test_empty(self):
Guido van Rossum09549f42007-08-27 20:40:10 +0000600 self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))
Walter Dörwalde22d3392005-11-17 08:52:34 +0000601
602 def test_bad_args(self):
603 self.assertRaises(TypeError, codecs.charbuffer_encode)
604 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
605
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM signature)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # the BOM written by the encoder must not show up in decoded text
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader in chunks of
        # various size hints (None means "read everything") and check that
        # the collected output equals unistring for every hint.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input starting with a BOM: the BOM must be dropped
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # input without a BOM must decode just the same
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
685
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() works on bytes and returns bytes, so both the
        # input and the expected result have to be bytes objects.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000689
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000690class RecodingTest(unittest.TestCase):
691 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000692 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000693 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000694 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000695 f2.close()
696 # Python used to crash on this at exit because of a refcount
697 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000698
# Sample strings from RFC 3492.  Each entry is a pair
# (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: report any entry that is not a 2-tuple
# (e.g. because of a missing comma between adjacent literals).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000806
class PunycodeTest(unittest.TestCase):
    """Round-trip the sample strings in ``punycode_testcases`` through
    the "punycode" codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII
            # round trip of the expected value.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000825
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec."""

    def setUp(self):
        # Every test here needs the codec; where the interpreter does
        # not provide it, report a skip instead of a LookupError.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("unicode_internal codec is not available")

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            return
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The vectors above are written most-significant byte first;
        # reverse them on little-endian hosts.
        little_endian = sys.byteorder == "little"
        for internal, uni in ok:
            if little_endian:
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if little_endian:
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode <= 0xffff:
            return
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            # The reported error range is the second 4-byte unit.
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail("UnicodeDecodeError not raised")

    def test_decode_callback(self):
        if sys.maxunicode <= 0xffff:
            return
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Splice four 0x22 bytes between the encoded "a" and "b"; with the
        # "ignore" handler the result is still "ab", and all 12 input
        # bytes are reported as consumed.
        ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                "ascii"),
                          "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # The second item returned by the encoder must be the number of
        # characters consumed.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)
880
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes.  An
# expected value of None means nameprep must reject the input;
# (None, None) marks a vector that is skipped.  Entries are kept in
# order so that index + 1 matches the vector number 3.N.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes \xef\xb9\xb6 below decode to U+FE76, not
    # U+FB38 as the title says -- confirm against the test-vector draft.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1033
1034
class NameprepTest(unittest.TestCase):
    """Run ``encodings.idna.nameprep`` over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (3.N) in the
                    # message, keeping the original error as the cause.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos+1, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001053
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through the one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read()
        # must return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed each input to iterdecode() one byte at a time.
        # NOTE(review): the last vector used to be asserted twice; a
        # variant without the trailing dot may have been intended.
        vectors = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in vectors:
            pieces = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(pieces, "idna")),
                expected
            )

        # Output is produced only for labels already terminated by a
        # dot; without one, the last label appears on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the
        # final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # NOTE(review): the last vector used to be asserted twice; a
        # variant without the trailing dot may have been intended.
        vectors = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in vectors:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Output is produced only for labels already terminated by a
        # dot; without one, the last label appears on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001129
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of the module-level
    functions in ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # Note: '\xffff' is the three characters U+00FF, 'f', 'f'.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001171
class StreamReaderTest(unittest.TestCase):
    """Check ``readlines()`` on a UTF-8 stream reader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # UTF-8 for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001180 self.assertEquals(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001181
class EncodedFileTest(unittest.TestCase):
    """Check recoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the underlying file come back
        # recoded as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input is stored in the underlying file as
        # Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001193
# Names of the str <-> bytes codecs iterated over by the tests,
# listed in sorted order and grouped here by codec family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864",
    "cp865", "cp866", "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build of the codecs module exposes
# mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
broken_incremental_coders = broken_unicode_with_streams + ["idna"]

# The following encodings only support "strict" mode
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001314
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks that are run against every codec listed in
    all_unicode_encodings."""

    def _assert_same(self, actual, expected, encoding):
        """Assert actual == expected, naming the codec in the failure message."""
        self.assertEqual(actual, expected,
                         "%r != %r (encoding=%r)" % (actual, expected, encoding))

    def _feed_through(self, encoder, make_decoder, s, final):
        """Encode s one character at a time, then decode it one byte at a time.

        make_decoder is a zero-argument callable; it is only invoked once
        encoding has finished.  When final is true, both coders receive a
        closing empty call with final=True so that buffered data is flushed.
        Returns the decoded string.
        """
        encodedresult = b"".join(encoder.encode(c) for c in s)
        if final:
            encodedresult += encoder.encode("", True)
        decoder = make_decoder()
        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
        if final:
            decodedresult += decoder.decode(b"", True)
        return decodedresult

    def _check_stream_roundtrip(self, encoding, s):
        """Round-trip s through the codec's StreamWriter and StreamReader."""
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            # The writer must hand bytes (not str) to the underlying stream.
            self.assertTrue(type(chunk) is bytes, type(chunk))
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            # Feed the reader a single byte at a time.
            q.write(bytes([c]))
            decodedresult += reader.read()
        self._assert_same(decodedresult, s, encoding)

    def _check_incremental_roundtrip(self, encoding, s):
        """Round-trip s through the incremental coders (fetched via the Python
        and C API) and through iterencode()/iterdecode()."""
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        decodedresult = self._feed_through(
            encoder, lambda: codecs.getincrementaldecoder(encoding)(), s, True)
        self._assert_same(decodedresult, s, encoding)

        # check C API
        decodedresult = self._feed_through(
            cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding), s, True)
        self._assert_same(decodedresult, s, encoding)

        # check iterencode()/iterdecode()
        result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
        self._assert_same(result, s, encoding)

        # check iterencode()/iterdecode() with empty string
        result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
        self.assertEqual(result, "")

    def _check_incremental_roundtrip_ignore(self, encoding, s):
        """Round-trip s through incremental coders created with the "ignore"
        error handler (no final flush call is made here)."""
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        decodedresult = self._feed_through(
            encoder,
            lambda: codecs.getincrementaldecoder(encoding)("ignore"),
            s, False)
        self._assert_same(decodedresult, s, encoding)

        decodedresult = self._feed_through(
            cencoder,
            lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
            s, False)
        self._assert_same(decodedresult, s, encoding)

    def test_basics(self):
        # Round-trip a short ASCII string through the stateless, stream,
        # incremental and iterator interfaces of every codec.
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by the codec registry must match the name
            # used here, ignoring the difference between "_" and "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # check stateless encoder/decoder
            (b, size) = codecs.getencoder(encoding)(s)
            self._assert_same(size, len(s), encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self._assert_same(chars, s, encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_roundtrip(encoding, s)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    self._check_incremental_roundtrip_ignore(encoding, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but idna
        # and punycode must also reject an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                encoded = u.encode(encoding)
                self.check_state_handling_decode(encoding, u, encoded)
                self.check_state_handling_encode(encoding, u, encoded)
1447
class CharmapTest(unittest.TestCase):
    """Checks for codecs.charmap_decode() when the mapping is a string."""

    def test_decode_with_string_map(self):
        # Every input byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": a byte beyond the end of the map becomes U+FFFD ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # ... and so does a byte whose map entry is U+FFFE.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": the same two cases are dropped from the output, but the
        # reported length still covers all input bytes.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map with "ignore" consumes everything and yields nothing.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1480
class WithStmtTest(unittest.TestCase):
    """Checks that codecs file wrappers can be used in a with statement."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8 for U+00FC; reading through the
        # EncodedFile wrapper must return it recoded as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through a UTF-8 StreamReaderWriter must return the
        # decoded character.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001493
class TypesTest(unittest.TestCase):
    """Checks which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is not present on every build, so only test it
        # when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1523
class SurrogateEscapeTest(unittest.TestCase):
    """Checks for the "surrogateescape" error handler: undecodable bytes
    decode to lone surrogates, and those surrogates encode back to the
    original bytes."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1556
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001557
def test_main():
    """Run every test case class of this module via support.run_unittest()."""
    support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
    )
Fred Drake2e2be372001-09-20 21:33:42 +00001586
1587
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()