blob: d0bcf569538188ee4a22095b5a48285549aef173 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        """Start the queue with *buffer* as its initial content.

        The buffer may be any sequence supporting ``+=`` and slicing;
        its type determines what ``read()`` returns.
        """
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* or ``None`` drains the whole queue.  An empty
        sequence of the buffer's own type is returned when nothing is
        left.
        """
        if size is None or size < 0:
            # Drain everything; slicing with [:0] keeps the buffer's type.
            data, self._buffer = self._buffer, self._buffer[:0]
            return data
        data = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return data
25
class MixInCheckStateHandling:
    """Mixin providing getstate()/setstate() round-trip checks.

    Meant to be combined with ``unittest.TestCase``, whose assertion
    methods are used here.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state survives a getstate()/setstate() hop.

        For every split point of the encoded bytes *s*, the first part is
        decoded, the decoder state is transplanted into a brand new
        decoder, and the rest is decoded there; the concatenated output
        must equal the text *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state survives a getstate()/setstate() hop.

        For every split point of the text *u*, the first part is encoded,
        the encoder state is transplanted into a brand new encoder, and
        the rest is encoded there; the concatenated output must equal the
        bytes *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """StreamReader and incremental-decoder tests shared by the codec tests.

    Every test reads the codec name from ``self.encoding``; this class
    does not define that attribute itself, so it has to be provided by
    the class the tests are run on.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* one byte at a time.

        After each byte the text decoded so far must equal the matching
        entry of *partialresults*.  The check is done with a StreamReader,
        with an incremental decoder, with the same decoder after reset(),
        and finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must split on \\n, \\r\\n, \\r and U+2028."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Spelled out as a tuple: all of these are whitespace, so building
        # the list with str.split() would yield nothing and silently turn
        # the loops below into no-ops.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would look like EOF to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must give back the lines unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is still being written to."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three \\r\\n-terminated lines must come back one by one."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five \\r\\n-terminated lines must come back one by one."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the BOM-detecting "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must be rejected."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """Truncated input with the 'replace' and 'ignore' handlers."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
356
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        """A non-BMP character is stored least significant byte first."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)
389
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        """A non-BMP character is stored most significant byte first."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)
422
class UTF16Test(ReadTest):
    """Tests for the BOM-detecting "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must be rejected."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """Truncated input with the 'replace' and 'ignore' handlers."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000485
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000507
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000529
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        """Decoder state round-trips across 1- to 4-byte sequences."""
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        """Lone surrogates are rejected in both directions by default."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")

    def test_surrogatepass_handler(self):
        """The 'surrogatepass' handler lets lone surrogates through."""
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000566
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per byte checked."""
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000581
class UTF16ExTest(unittest.TestCase):
    """Argument and error handling of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling without arguments is a TypeError."""
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
589
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array.array is returned as bytes together with its length."""
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        """An empty string yields empty bytes and a length of 0."""
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument is a TypeError."""
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
605
# codecs.charbuffer_encode() does not exist in every Python 3 release;
# skip instead of erroring out with AttributeError where it is missing.
@unittest.skipUnless(hasattr(codecs, "charbuffer_encode"),
                     "codecs.charbuffer_encode() is not available")
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode()."""

    def test_string(self):
        """Bytes are returned unchanged together with their length."""
        self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))

    def test_empty(self):
        """Empty bytes yield empty bytes and a length of 0."""
        self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument is a TypeError."""
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
617
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with a leading signature)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        """Byte-by-byte decoding: one entry per encoded byte."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder drops the signature the encoder added."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Decode bytestring through a StreamReader in chunks of various
        # sizes (None means a plain read()) and compare with unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        """Chunked reading of input that starts with the signature."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        """Chunked reading of input without the signature."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
697
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty bytes with 0 bytes consumed."""
        # escape_decode() returns bytes, so the expected value has to be
        # b"" -- comparing against the str "" can never succeed.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000701
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        """Write through an EncodedFile and close it."""
        # The "unicode_internal" codec is not present in every Python
        # release; without it EncodedFile() cannot even be constructed.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("the unicode_internal codec is not available")
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        try:
            f2.write("a")
        finally:
            f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000710
# Sample strings from RFC 3492.
# Each entry is a pair: (unicode text, expected punycode as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: print any entry that is not a 2-tuple, which
# would indicate a misplaced comma in the table above.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000818
819class PunycodeTest(unittest.TestCase):
820 def test_encode(self):
821 for uni, puny in punycode_testcases:
822 # Need to convert both strings to lower case, since
823 # some of the extended encodings use upper case, but our
824 # code produces only lower case. Converting just puny to
825 # lower is also insufficient, since some of the input characters
826 # are upper case.
Walter Dörwalda4c61282007-05-10 12:36:25 +0000827 self.assertEquals(
828 str(uni.encode("punycode"), "ascii").lower(),
829 str(puny, "ascii").lower()
830 )
Martin v. Löwis2548c732003-04-18 10:39:54 +0000831
832 def test_decode(self):
833 for uni, puny in punycode_testcases:
834 self.assertEquals(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +0000835 puny = puny.decode("ascii").encode("ascii")
836 self.assertEquals(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000837
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000838class UnicodeInternalTest(unittest.TestCase):
839 def test_bug1251300(self):
840 # Decoding with unicode_internal used to not correctly handle "code
841 # points" above 0x10ffff on UCS-4 builds.
842 if sys.maxunicode > 0xffff:
843 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000844 (b"\x00\x10\xff\xff", "\U0010ffff"),
845 (b"\x00\x00\x01\x01", "\U00000101"),
846 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000847 ]
848 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000849 b"\x7f\xff\xff\xff",
850 b"\x80\x00\x00\x00",
851 b"\x81\x00\x00\x00",
852 b"\x00",
853 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000854 ]
855 for internal, uni in ok:
856 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000857 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000858 self.assertEquals(uni, internal.decode("unicode_internal"))
859 for internal in not_ok:
860 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000861 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000862 self.assertRaises(UnicodeDecodeError, internal.decode,
863 "unicode_internal")
864
865 def test_decode_error_attributes(self):
866 if sys.maxunicode > 0xffff:
867 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000868 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000869 except UnicodeDecodeError as ex:
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000870 self.assertEquals("unicode_internal", ex.encoding)
Walter Dörwald092a2252007-06-07 11:26:16 +0000871 self.assertEquals(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000872 self.assertEquals(4, ex.start)
873 self.assertEquals(8, ex.end)
874 else:
875 self.fail()
876
877 def test_decode_callback(self):
878 if sys.maxunicode > 0xffff:
879 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
880 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000881 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000882 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
883 "ascii"),
884 "UnicodeInternalTest")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000885 self.assertEquals(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000886
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000887 def test_encode_length(self):
888 # Issue 3739
889 encoder = codecs.getencoder("unicode_internal")
890 self.assertEquals(encoder("a")[1], 1)
891 self.assertEquals(encoder("\xe9\u0142")[1], 2)
892
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes:
#   * expected is None     -> nameprep must reject the input
#   * (None, None)         -> vector is skipped
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1045
1046
1047class NameprepTest(unittest.TestCase):
1048 def test_nameprep(self):
1049 from encodings.idna import nameprep
1050 for pos, (orig, prepped) in enumerate(nameprep_tests):
1051 if orig is None:
1052 # Skipped
1053 continue
1054 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001055 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001056 if prepped is None:
1057 # Input contains prohibited characters
1058 self.assertRaises(UnicodeError, nameprep, orig)
1059 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001060 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001061 try:
1062 self.assertEquals(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001063 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001064 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001065
class IDNACodecTest(unittest.TestCase):
    """Tests for the ``idna`` codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes names with and without trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes names with and without trailing dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader gives ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time decoding matches the one-shot results."""
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Cover the name both without and with a trailing dot, mirroring
        # test_builtin_decode (the dotted case used to be checked twice).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time encoding matches the one-shot results."""
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Cover the name both without and with a trailing dot, mirroring
        # test_builtin_encode (the dotted case used to be checked twice).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001141
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode(): explicit codec, default codec and error cases."""
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit codec, default codec and error cases."""
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE(review): the literal below is U+00FF followed by "ff" (three
        # characters), possibly meant as U+FFFF; either way it is not
        # ASCII-encodable, so the expected exception is the same.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        """codecs.lookup() fails for missing arguments and unknown names."""
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        """codecs.getencoder() fails for missing arguments and unknown names."""
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() fails for missing arguments and unknown names."""
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        """codecs.getreader() fails for missing arguments and unknown names."""
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        """codecs.getwriter() fails for missing arguments and unknown names."""
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001183
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 ``codecs`` stream reader."""

    def setUp(self):
        # Two characters encoded as UTF-8, separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        """readlines() returns decoded lines and keeps the line ending."""
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001193
class EncodedFileTest(unittest.TestCase):
    """Tests for ``codecs.EncodedFile`` recoding in both directions."""

    def test_basic(self):
        """Reads are recoded to the data encoding, writes to the file encoding."""
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001205
# Codec names exercised by the generic tests, in alphabetical order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864",
    "cp865", "cp866", "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# Deliberately left out of the list above because it is not supposed
# to work:
#    "undefined"

# Encodings that don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Everything broken for streams, plus "idna"
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")

# Encodings that only support "strict" mode
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001326
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """
    Generic checks that are run for every name in all_unicode_encodings.
    """

    def _mismatch(self, got, expected, encoding):
        # Failure message shared by the round-trip assertions below.
        return "%r != %r (encoding=%r)" % (got, expected, encoding)

    def _check_registered_name(self, encoding):
        # The name reported by codecs.lookup() must match the name used for
        # the lookup, ignoring "_" vs. "-" (with the "_codec" suffix and
        # "latin_1" handled specially).
        canonical = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            canonical += "_codec"
        elif encoding == "latin_1":
            canonical = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), canonical.replace("_", "-"))

    def _check_stateless_roundtrip(self, encoding, text):
        # Encode and decode the whole string with the stateless functions.
        encoded, consumed = codecs.getencoder(encoding)(text)
        self.assertEqual(consumed, len(text),
                         self._mismatch(consumed, len(text), encoding))
        decoded, consumed = codecs.getdecoder(encoding)(encoded)
        self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

    def _check_stream_roundtrip(self, encoding, text):
        # check stream reader/writer
        sink = Queue(b"")
        writer = codecs.getwriter(encoding)(sink)
        chunks = []
        for char in text:
            # Write one character at a time and collect whatever bytes the
            # writer pushed into the queue.
            writer.write(char)
            chunk = sink.read()
            self.assertTrue(type(chunk) is bytes, type(chunk))
            chunks.append(chunk)
        encoded = b"".join(chunks)

        source = Queue(b"")
        reader = codecs.getreader(encoding)(source)
        pieces = []
        for byte in encoded:
            # Feed the reader a single byte at a time.
            source.write(bytes([byte]))
            pieces.append(reader.read())
        decoded = "".join(pieces)
        self.assertEqual(decoded, text, self._mismatch(decoded, text, encoding))

    def _check_incremental_roundtrip(self, encoding, text):
        # check incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # First the coders fetched via the Python API, then the ones fetched
        # via the C API.  Each decoder is only created once the matching
        # encoder has produced all of its output.
        variants = (
            (encoder, lambda: codecs.getincrementaldecoder(encoding)()),
            (cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding)),
        )
        for enc, make_decoder in variants:
            encoded = b"".join(enc.encode(char) for char in text)
            encoded += enc.encode("", True)
            dec = make_decoder()
            decoded = "".join(dec.decode(bytes([byte])) for byte in encoded)
            decoded += dec.decode(b"", True)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

        # check iterencode()/iterdecode()
        result = "".join(codecs.iterdecode(codecs.iterencode(text, encoding), encoding))
        self.assertEqual(result, text, self._mismatch(result, text, encoding))

        # check iterencode()/iterdecode() with empty string
        result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
        self.assertEqual(result, "")

    def _check_incremental_roundtrip_ignore(self, encoding, text):
        # check incremental decoder/encoder with errors argument
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        variants = (
            (encoder, lambda: codecs.getincrementaldecoder(encoding)("ignore")),
            (cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore")),
        )
        for enc, make_decoder in variants:
            encoded = b"".join(enc.encode(char) for char in text)
            dec = make_decoder()
            decoded = "".join(dec.decode(bytes([byte])) for byte in encoded)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            self._check_registered_name(encoding)
            self._check_stateless_roundtrip(encoding, text)

            if encoding not in broken_unicode_with_streams:
                self._check_stream_roundtrip(encoding, text)

            if encoding in broken_incremental_coders:
                continue
            self._check_incremental_roundtrip(encoding, text)
            if encoding not in only_strict_mode:
                self._check_incremental_roundtrip_ignore(encoding, text)

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped as well: FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        # Every decoder rejects a call without arguments; all of them except
        # "idna" and "punycode" are also checked with an int argument.
        int_check_skipped = ("idna", "punycode")
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            if encoding in int_check_skipped:
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        # Every encoder rejects a call without arguments.
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text, text.encode(encoding))
            self.check_state_handling_encode(encoding, text, text.encode(encoding))
1459
class CharmapTest(unittest.TestCase):
    """
    Checks for codecs.charmap_decode() when the mapping is a string.
    """

    def test_decode_with_string_map(self):
        # assertEqual is used throughout: assertEquals is a deprecated alias
        # that newer unittest releases no longer provide.

        # Every input byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Byte 2 lies beyond the end of the map: "replace" yields U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A U+FFFE map entry gives the same result as a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # With "ignore" the unmapped byte is dropped from the output, but
        # still counted as consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map combined with "ignore" drops every byte.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1492
class WithStmtTest(unittest.TestCase):
    """
    Checks that the codecs stream wrappers can be used in a with statement.
    """

    def test_encodedfile(self):
        # b"\xc3\xbc" is U+00FC encoded as UTF-8; reading through the
        # wrapper must yield the Latin-1 byte for the same character.
        # (assertEqual replaces the deprecated assertEquals alias.)
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Same input bytes, this time decoded to text by the UTF-8
        # StreamReader wrapped in a StreamReaderWriter.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001505
class TypesTest(unittest.TestCase):
    """
    Checks which argument types the low-level decode functions accept.
    """

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when this codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        # (assertEqual replaces the deprecated assertEquals alias.)
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1535
class SurrogateEscapeTest(unittest.TestCase):
    """
    Round trips through the "surrogateescape" error handler: undecodable
    bytes become lone surrogates and encode back to the original bytes.
    """

    def test_utf8(self):
        cases = [
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        ]
        for raw, escaped in cases:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), escaped)
            self.assertEqual(escaped.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw = b"foo\x80bar"
        escaped = "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw = b"foo\xa5bar"
        escaped = "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1568
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001569
def test_main():
    # Hand the test case classes to support.run_unittest() in the order
    # listed here.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
    )
    support.run_unittest(*test_classes)


# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()