blob: 7de1ed5f6bb2fc49efd3df13c73c8b955e1adaaa [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    FIFO buffer: data is written at one end and read from the other end.

    The buffer keeps the type of the object passed to the constructor
    (only ``+=`` and slicing are applied to it).
    """
    def __init__(self, buffer):
        # Initial contents of the queue.
        self._buffer = buffer

    def write(self, chars):
        """Append ``chars`` to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """
        Remove and return data from the front of the queue.

        A negative ``size`` (the default) drains the whole queue;
        otherwise at most ``size`` items are returned.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0] # make empty, keep the type
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
25
class MixInCheckStateHandling:
    """
    Helpers that check getstate()/setstate() of incremental codecs.

    Meant to be mixed into a unittest.TestCase: the helpers call the
    assert* methods on ``self``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point of the encoded input s: decode the first
        # part, move the decoder state into a fresh decoder, decode the rest
        # and check that both parts together give u.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # For every split point of the text u: encode the first part, move
        # the encoder state into a fresh encoder, encode the rest and check
        # that both parts together give s.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """
    StreamReader and incremental decoder tests shared by the codec tests.

    Subclasses set the class attribute ``encoding``; every test in this
    class reads ``self.encoding``.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: all of them are whitespace,
        # so building this list with str.split() would yield an empty list
        # and silently skip the loops below.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would end readalllines() early.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data can arrive between
        # two readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000278
class UTF32Test(ReadTest):
    """ReadTest run against the "utf-32" codec, plus BOM handling checks."""
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in each byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # Truncated input with the non-strict error handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
355
class UTF32LETest(ReadTest):
    """ReadTest run against the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per input byte: a character appears once its
        # fourth byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)
388
class UTF32BETest(ReadTest):
    """ReadTest run against the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per input byte: a character appears once its
        # fourth byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)
421
class UTF16Test(ReadTest):
    """ReadTest run against the "utf-16" codec, plus BOM handling checks."""
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in each byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # Truncated input with the non-strict error handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            # NOTE(review): the 'U' mode flag is no longer accepted by
            # open()/codecs.open() from Python 3.11 on -- confirm which
            # interpreter versions this file has to run on.
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            support.unlink(support.TESTFN)
499
class UTF16LETest(ReadTest):
    """ReadTest run against the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per input byte: a character appears once its
        # second byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000521
class UTF16BETest(ReadTest):
    """ReadTest run against the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per input byte: a character appears once its
        # second byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000543
class UTF8Test(ReadTest):
    """ReadTest run against the "utf-8" codec, plus surrogate handling."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per input byte: a character appears once its
        # last byte has been read.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode, in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # ... and are passed on to the selected error handler otherwise.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000590
class UTF7Test(ReadTest):
    """ReadTest run against the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per encoded byte of the input.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000605
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte is truncated UTF-16; with final=True this is an error.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
613
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # The result is a (bytes, length consumed) pair.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
629
class UTF8SigTest(ReadTest):
    """ReadTest run against the "utf-8-sig" codec, plus BOM handling."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader once per read
        # size (None means "read everything in one call") and check that the
        # concatenated output equals unistring every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
709
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() returns bytes, so the expected value must be
        # bytes as well; a str expectation can never compare equal.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000713
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        # NOTE(review): the "unicode_internal" codec was removed in
        # Python 3.8 -- confirm which interpreter versions this file has
        # to run on.
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # close() is forwarded to the wrapped stream.
        self.assertTrue(f.closed)
Fred Drake2e2be372001-09-20 21:33:42 +0000722
# From RFC 3492
# Each entry is a (text, punycode) pair: the str on the left and the bytes
# that the "punycode" codec is expected to map it to/from on the right.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
]

# Import-time sanity check: report any entry that is not a 2-tuple (for
# example because a comma went missing between two adjacent literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000830
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against ``punycode_testcases``."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Repeat with the same bytes rebuilt through an ASCII
            # decode/encode round trip.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000849
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec."""

    def setUp(self):
        # Interpreters that no longer ship the codec (it was removed in
        # Python 3.8) cannot run these tests; skip instead of erroring.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("the unicode_internal codec is not available")

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                (b"\x00\x10\xff\xff", "\U0010ffff"),
                (b"\x00\x00\x01\x01", "\U00000101"),
                (b"", ""),
            ]
            not_ok = [
                b"\x7f\xff\xff\xff",
                b"\x80\x00\x00\x00",
                b"\x81\x00\x00\x00",
                b"\x00",
                b"\x00\x00\x00\x00\x00",
            ]
            # The literals above are written big-endian; flip them on
            # little-endian machines.
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
            # Fails the test if no UnicodeDecodeError is raised.
            with self.assertRaises(UnicodeDecodeError) as caught:
                data.decode("unicode_internal")
            ex = caught.exception
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = "ab".encode("unicode_internal").decode()
            # Splice four extra bytes between the encoded "a" and "b"; the
            # registered handler is expected to drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
            self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
906
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both UTF-8 encoded bytes.  An expected
# value of None marks input that nameprep must reject; (None, None) is a
# placeholder for a skipped vector so that list positions keep matching the
# "3.N" numbering.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
]
1059
1060
class NameprepTest(unittest.TestCase):
    """Run ``encodings.idna.nameprep`` over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (entries are listed
                    # in "3.N" order, so N is the list position plus one).
                    raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001079
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001080class IDNACodecTest(unittest.TestCase):
1081 def test_builtin_decode(self):
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001082 self.assertEquals(str(b"python.org", "idna"), "python.org")
1083 self.assertEquals(str(b"python.org.", "idna"), "python.org.")
1084 self.assertEquals(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
1085 self.assertEquals(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001086
1087 def test_builtin_encode(self):
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001088 self.assertEquals("python.org".encode("idna"), b"python.org")
1089 self.assertEquals("python.org.".encode("idna"), b"python.org.")
1090 self.assertEquals("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
1091 self.assertEquals("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001092
Martin v. Löwis8b595142005-08-25 11:03:38 +00001093 def test_stream(self):
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00001094 r = codecs.getreader("idna")(io.BytesIO(b"abc"))
Martin v. Löwis8b595142005-08-25 11:03:38 +00001095 r.read(3)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001096 self.assertEquals(r.read(), "")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001097
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001098 def test_incremental_decode(self):
1099 self.assertEquals(
Guido van Rossum09549f42007-08-27 20:40:10 +00001100 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001101 "python.org"
1102 )
1103 self.assertEquals(
Guido van Rossum09549f42007-08-27 20:40:10 +00001104 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001105 "python.org."
1106 )
1107 self.assertEquals(
Guido van Rossum09549f42007-08-27 20:40:10 +00001108 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001109 "pyth\xf6n.org."
1110 )
1111 self.assertEquals(
Guido van Rossum09549f42007-08-27 20:40:10 +00001112 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001113 "pyth\xf6n.org."
1114 )
1115
1116 decoder = codecs.getincrementaldecoder("idna")()
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001117 self.assertEquals(decoder.decode(b"xn--xam", ), "")
1118 self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1119 self.assertEquals(decoder.decode(b"rg"), "")
1120 self.assertEquals(decoder.decode(b"", True), "org")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001121
1122 decoder.reset()
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001123 self.assertEquals(decoder.decode(b"xn--xam", ), "")
1124 self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1125 self.assertEquals(decoder.decode(b"rg."), "org.")
1126 self.assertEquals(decoder.decode(b"", True), "")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001127
1128 def test_incremental_encode(self):
1129 self.assertEquals(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001130 b"".join(codecs.iterencode("python.org", "idna")),
1131 b"python.org"
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001132 )
1133 self.assertEquals(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001134 b"".join(codecs.iterencode("python.org.", "idna")),
1135 b"python.org."
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001136 )
1137 self.assertEquals(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001138 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1139 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001140 )
1141 self.assertEquals(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001142 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1143 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001144 )
1145
1146 encoder = codecs.getincrementalencoder("idna")()
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001147 self.assertEquals(encoder.encode("\xe4x"), b"")
1148 self.assertEquals(encoder.encode("ample.org"), b"xn--xample-9ta.")
1149 self.assertEquals(encoder.encode("", True), b"org")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001150
1151 encoder.reset()
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001152 self.assertEquals(encoder.encode("\xe4x"), b"")
1153 self.assertEquals(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
1154 self.assertEquals(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001155
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE: '\xffff' is the character U+00FF followed by "ff".
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001197
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 ``StreamReader`` on an in-memory byte stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001207
class EncodedFileTest(unittest.TestCase):
    """Tests for ``codecs.EncodedFile`` recoding on read and on write."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed to write() land in the file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001219
# Names of the text encodings exercised by the generic codec tests.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001340
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec in all_unicode_encodings."""

    def test_basics(self):
        # Round-trip a short string through the stateless, stream,
        # incremental (Python and C API) and iterator interfaces.
        mismatch = "%r != %r (encoding=%r)"
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name has to agree with the name used
            # for the lookup, ignoring the "_" versus "-" spelling.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder functions
            raw, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             mismatch % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(raw)
            self.assertEqual(decoded, text,
                             mismatch % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: write one character at a
                # time, then hand the reader one byte at a time
                pipe = Queue(b"")
                writer = codecs.getwriter(encoding)(pipe)
                raw = b""
                for char in text:
                    writer.write(char)
                    chunk = pipe.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    raw += chunk
                pipe = Queue(b"")
                reader = codecs.getreader(encoding)(pipe)
                pieces = []
                for byte in raw:
                    pipe.write(bytes([byte]))
                    pieces.append(reader.read())
                decoded = "".join(pieces)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder (Python API)
                raw = b"".join([encoder.encode(char) for char in text])
                raw += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = "".join(
                    [decoder.decode(bytes([byte])) for byte in raw])
                decoded += decoder.decode(b"", True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # check C API
                raw = b"".join([cencoder.encode(char) for char in text])
                raw += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decoded = "".join(
                    [cdecoder.decode(bytes([byte])) for byte in raw])
                decoded += cdecoder.decode(b"", True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                roundtrip = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(roundtrip, text,
                                 mismatch % (roundtrip, text, encoding))

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode("", encoding)
                roundtrip = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(roundtrip, "")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(
                        encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    raw = b""
                    for char in text:
                        raw += encoder.encode(char)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decoded = ""
                    for byte in raw:
                        decoded += decoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text,
                                     mismatch % (decoded, text, encoding))

                    raw = b""
                    for char in text:
                        raw += cencoder.encode(char)
                    cdecoder = _testcapi.codec_incrementaldecoder(
                        encoding, "ignore")
                    decoded = ""
                    for byte in raw:
                        decoded += cdecoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text,
                                     mismatch % (decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped here -- FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for attempt in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # "idna" and "punycode" are also required to reject an int.
        lenient = ("idna", "punycode")
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            if encoding in lenient:
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison is intentionally trivial.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
1473
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() driven by a str decoding map."""

    def test_decode_with_string_map(self):
        # Each call returns (decoded text, number of bytes consumed).
        # assertEqual is used throughout: the assertEquals spelling is a
        # deprecated alias.

        # Every input byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": a byte without a map entry becomes U+FFFD ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # ... and a "\ufffe" map entry is expected to give the same result
        # as a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": the undecodable byte is dropped from the output but
        # still counted as consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map combined with "ignore" drops every possible byte.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1506
class WithStmtTest(unittest.TestCase):
    """Check that codecs file wrappers can be used in a with statement."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8 bytes; reading through the
        # EncodedFile wrapper must yield them recoded to Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            # assertEqual, not the deprecated assertEquals alias.
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through a StreamReaderWriter built from the utf-8
        # codec's reader/writer must yield decoded text.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001519
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        # (assertEqual replaces the deprecated assertEquals alias.)
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1549
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        handler = "surrogateescape"
        # Bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)
        # bad-utf-8 encoded surrogate
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)

    def test_ascii(self):
        handler = "surrogateescape"
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", handler), text)
        self.assertEqual(text.encode("ascii", handler), raw)

    def test_charmap(self):
        handler = "surrogateescape"
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", handler), text)
        self.assertEqual(text.encode("iso-8859-3", handler), raw)

    def test_latin1(self):
        # Issue6373
        text = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(text.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1582
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001583
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        # For each encoding, write through codecs.open() in 'w+' mode with
        # various seek() patterns and verify that reading the file back
        # yields exactly the text that was written.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every case below (re)creates support.TESTFN; make sure the
        # scratch file is removed afterwards, whether the test passes or
        # fails.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1638
Victor Stinner3fed0872010-05-22 02:16:27 +00001639
def test_main():
    # Hand the listed test classes to the regrtest helper, in this order.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()