blob: 6e7afc422b524b9079c75eb1cce8c2b8585967a9 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The queue keeps whatever sequence type it was created with (the tests
    in this file pass ``b""``); it only relies on ``+=`` and slicing.
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            data = self._buffer
            self._buffer = self._buffer[:0]  # make empty (keeps the type)
        else:
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        return data
25
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for incremental codecs.

    The methods use ``self.assert*``, so the mixin has to be combined with
    ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be suspended and resumed anywhere.

        For every split point ``i`` the first ``i`` bytes of *s* are decoded,
        the decoder state is saved, restored into a brand new decoder, and
        the remaining bytes are decoded with it.  The concatenated output
        must equal *u*.
        """
        for i in range(len(s)+1):
            decoder = codecs.getincrementaldecoder(encoding)()
            part1 = decoder.decode(s[:i])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            part2 = decoder.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be suspended and resumed anywhere.

        Mirror image of check_state_handling_decode(): *u* is split at every
        character offset, the encoder state is transplanted into a new
        encoder, and the concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """StreamReader / incremental decoder tests shared by several codecs.

    Subclasses set the ``encoding`` class attribute; every test here encodes
    its sample text with ``self.encoding`` and reads it back.
    """

    def _check_incremental(self, decoder, encoded, partialresults):
        # Feed *encoded* to *decoder* one byte at a time and compare the
        # accumulated output with the matching entry of *partialresults*.
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += decoder.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(decoder.decode(b"", True), "")
        self.assertEqual(decoder.buffer, b"")

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)

        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        self._check_incremental(d, encoded, partialresults)

        # Check whether the reset method works properly
        d.reset()
        self._check_incremental(d, encoded, partialresults)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so that the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: str.split() without
        # arguments treats every one of them as whitespace and would return
        # an empty list, silently turning the loops below into dead code.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps the first line non-empty: readalllines() stops at
            # the first empty result, which an empty line would produce when
            # keepends is false.
            vw.append((i*200+200)*"\3042" + lineend)
            vwo.append((i*200+200)*"\3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were written.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in either byte order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
356
class UTF32LETest(ReadTest):
    """Tests for the little-endian, BOM-less "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per input byte: a character only shows up in the output
        # once the last of its four bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)
389
class UTF32BETest(ReadTest):
    """Tests for the big-endian, BOM-less "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per input byte: a character only shows up in the output
        # once the last of its four bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)
422
class UTF16Test(ReadTest):
    """Tests for the BOM-aware "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in either byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            support.unlink(support.TESTFN)
500
class UTF16LETest(ReadTest):
    """Tests for the little-endian, BOM-less "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per input byte: a character only shows up in the output
        # once the second of its two bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000522
class UTF16BETest(ReadTest):
    """Tests for the big-endian, BOM-less "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per input byte: a character only shows up in the output
        # once the second of its two bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000544
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        # One entry per input byte; multi-byte characters only show up once
        # their last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode, in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # ... and go through the normal error handlers when encoding.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000591
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        # One entry per byte of the encoded form of "a+-b".
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000606
class UTF16ExTest(unittest.TestCase):
    """Argument and error handling of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte is truncated input; with final=True it must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
614
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
630
# codecs.charbuffer_encode() is not available in every Python 3 release;
# skip instead of failing with AttributeError where it is missing.
@unittest.skipUnless(hasattr(codecs, "charbuffer_encode"),
                     "codecs.charbuffer_encode() is not available")
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode()."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
642
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sized_reads(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader with a range of
        # read sizes (None means "no size argument") and check that the
        # concatenated chunks always equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_reads(bytestring, unistring)

    def test_stream_bare(self):
        # Same data without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_reads(bytestring, unistring)
722
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected result is b"",
        # not the empty text string.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000726
class RecodingTest(unittest.TestCase):
    """Regression test for a crash when recoding through EncodedFile."""

    def test_recoding(self):
        # The "unicode_internal" codec does not exist in every Python 3
        # release; skip rather than fail with LookupError where it is gone.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("unicode_internal codec is not available")

        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        try:
            f2.write("a")
        finally:
            f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000735
# From RFC 3492
# Each entry is a (text, expected punycode bytes) pair; PunycodeTest
# compares them case-insensitively when encoding.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check on the table itself: print any entry that does not have
# exactly two elements.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000843
844class PunycodeTest(unittest.TestCase):
845 def test_encode(self):
846 for uni, puny in punycode_testcases:
847 # Need to convert both strings to lower case, since
848 # some of the extended encodings use upper case, but our
849 # code produces only lower case. Converting just puny to
850 # lower is also insufficient, since some of the input characters
851 # are upper case.
Walter Dörwalda4c61282007-05-10 12:36:25 +0000852 self.assertEquals(
853 str(uni.encode("punycode"), "ascii").lower(),
854 str(puny, "ascii").lower()
855 )
Martin v. Löwis2548c732003-04-18 10:39:54 +0000856
857 def test_decode(self):
858 for uni, puny in punycode_testcases:
859 self.assertEquals(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +0000860 puny = puny.decode("ascii").encode("ascii")
861 self.assertEquals(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000862
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000863class UnicodeInternalTest(unittest.TestCase):
864 def test_bug1251300(self):
865 # Decoding with unicode_internal used to not correctly handle "code
866 # points" above 0x10ffff on UCS-4 builds.
867 if sys.maxunicode > 0xffff:
868 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000869 (b"\x00\x10\xff\xff", "\U0010ffff"),
870 (b"\x00\x00\x01\x01", "\U00000101"),
871 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000872 ]
873 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000874 b"\x7f\xff\xff\xff",
875 b"\x80\x00\x00\x00",
876 b"\x81\x00\x00\x00",
877 b"\x00",
878 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000879 ]
880 for internal, uni in ok:
881 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000882 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000883 self.assertEquals(uni, internal.decode("unicode_internal"))
884 for internal in not_ok:
885 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000886 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000887 self.assertRaises(UnicodeDecodeError, internal.decode,
888 "unicode_internal")
889
890 def test_decode_error_attributes(self):
891 if sys.maxunicode > 0xffff:
892 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000893 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000894 except UnicodeDecodeError as ex:
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000895 self.assertEquals("unicode_internal", ex.encoding)
Walter Dörwald092a2252007-06-07 11:26:16 +0000896 self.assertEquals(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000897 self.assertEquals(4, ex.start)
898 self.assertEquals(8, ex.end)
899 else:
900 self.fail()
901
902 def test_decode_callback(self):
903 if sys.maxunicode > 0xffff:
904 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
905 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000906 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000907 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
908 "ascii"),
909 "UnicodeInternalTest")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000910 self.assertEquals(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000911
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000912 def test_encode_length(self):
913 # Issue 3739
914 encoder = codecs.getencoder("unicode_internal")
915 self.assertEquals(encoder("a")[1], 1)
916 self.assertEquals(encoder("\xe9\u0142")[1], 2)
917
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000918 self.assertEquals(codecs.escape_encode(br'\x00')[1], 4)
919
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# NameprepTest treats expected=None as "nameprep must raise UnicodeError"
# and skips entries whose input is None.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1072
1073
1074class NameprepTest(unittest.TestCase):
1075 def test_nameprep(self):
1076 from encodings.idna import nameprep
1077 for pos, (orig, prepped) in enumerate(nameprep_tests):
1078 if orig is None:
1079 # Skipped
1080 continue
1081 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001082 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001083 if prepped is None:
1084 # Input contains prohibited characters
1085 self.assertRaises(UnicodeError, nameprep, orig)
1086 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001087 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001088 try:
1089 self.assertEquals(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001090 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001091 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001092
class IDNACodecTest(unittest.TestCase):
    """Tests for the idna codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Reading again after the first read must yield an empty string.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII name without and with a trailing dot, mirroring the
        # four cases of test_builtin_decode.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that ends it has been seen;
        # the last label comes out on the final call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without and with a trailing dot, mirroring the
        # four cases of test_builtin_encode.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001168
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001210
class StreamReaderTest(unittest.TestCase):
    """Tests for line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two three-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001220
class EncodedFileTest(unittest.TestCase):
    """Tests for transcoding through codecs.EncodedFile."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed to the wrapper land in the
        # underlying file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001232
# Names of the codecs that BasicUnicodeTest iterates over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Everything that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001353
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec in all_unicode_encodings."""

    @staticmethod
    def _mismatch(actual, expected, encoding):
        """Build the failure message shared by the round-trip assertions."""
        return "%r != %r (encoding=%r)" % (actual, expected, encoding)

    @staticmethod
    def _incremental_roundtrip(text, encoder, make_decoder, flush):
        """Encode text char by char, then decode the result byte by byte.

        make_decoder is only called once encoding has finished.  When flush
        is true, both coders additionally receive a trailing call with empty
        input and final=True.  Returns the decoded string.
        """
        encoded_parts = [encoder.encode(char) for char in text]
        if flush:
            encoded_parts.append(encoder.encode("", True))
        encoded = b"".join(encoded_parts)

        decoder = make_decoder()
        decoded_parts = [decoder.decode(bytes([byte])) for byte in encoded]
        if flush:
            decoded_parts.append(decoder.decode(b"", True))
        return "".join(decoded_parts)

    def test_basics(self):
        # all codecs should be able to encode these
        text = "abc123"
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the requested one,
            # treating "_" and "-" as equivalent.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # Stateless encoder/decoder round trip.
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             self._mismatch(consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # Stream writer, fed one character at a time.
                queue = Queue(b"")
                writer = codecs.getwriter(encoding)(queue)
                pieces = []
                for char in text:
                    writer.write(char)
                    piece = queue.read()
                    self.assertTrue(type(piece) is bytes, type(piece))
                    pieces.append(piece)
                encoded = b"".join(pieces)

                # Stream reader, fed one byte at a time.
                queue = Queue(b"")
                reader = codecs.getreader(encoding)(queue)
                decoded = ""
                for byte in encoded:
                    queue.write(bytes([byte]))
                    decoded += reader.read()
                self.assertEqual(decoded, text,
                                 self._mismatch(decoded, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # Incremental decoder/encoder (fetched via the Python and the
            # C API) and iterencode()/iterdecode().
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # Python API
                decoded = self._incremental_roundtrip(
                    text, encoder,
                    lambda: codecs.getincrementaldecoder(encoding)(),
                    True)
                self.assertEqual(decoded, text,
                                 self._mismatch(decoded, text, encoding))

                # C API
                decoded = self._incremental_roundtrip(
                    text, cencoder,
                    lambda: _testcapi.codec_incrementaldecoder(encoding),
                    True)
                self.assertEqual(decoded, text,
                                 self._mismatch(decoded, text, encoding))

                # iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                joined = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(joined, text,
                                 self._mismatch(joined, text, encoding))

                # iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode("", encoding)
                joined = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(joined, "")

            if encoding in only_strict_mode:
                continue

            # Incremental decoder/encoder with an explicit errors argument;
            # no final flush call is made for these.
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                decoded = self._incremental_roundtrip(
                    text, encoder,
                    lambda: codecs.getincrementaldecoder(encoding)("ignore"),
                    False)
                self.assertEqual(decoded, text,
                                 self._mismatch(decoded, text, encoding))

                decoded = self._incremental_roundtrip(
                    text, cencoder,
                    lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
                    False)
                self.assertEqual(decoded, text,
                                 self._mismatch(decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped as well.  FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all except
        # "idna" and "punycode" are also checked against an int argument.
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encode)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            encoded = text.encode(encoding)
            self.check_state_handling_decode(encoding, text, encoded)
            self.check_state_handling_encode(encoding, text, encoded)
1486
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() driven by a str decoding map."""

    def test_decode_with_string_map(self):
        # Uses assertEqual throughout: assertEquals is a deprecated alias.

        # Every input byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": a byte beyond the end of the map decodes to U+FFFD ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # ... and so does a byte whose map entry is U+FFFE.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": the same two cases are dropped from the output, while
        # the consumed length still covers the whole input.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map with "ignore" drops every possible byte value.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1519
class WithStmtTest(unittest.TestCase):
    """Checks that codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        # The underlying file holds two UTF-8 bytes; reading through the
        # EncodedFile wrapper yields the single Latin-1 byte b"\xfc".
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Same bytes, read as text through a StreamReaderWriter built from
        # the UTF-8 codec's stream reader and writer.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001532
class TypesTest(unittest.TestCase):
    """Checks which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1562
class SurrogateEscapeTest(unittest.TestCase):
    """Checks of the "surrogateescape" error handler with several codecs."""

    def _check_both_ways(self, encoding, raw, text):
        # Decoding raw must give text, and encoding text must give raw back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_both_ways("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_both_ways("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_both_ways("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1595
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001596
class BomTest(unittest.TestCase):
    """Checks BOM handling of files opened through codecs.open()."""

    def test_seek0(self):
        # Imported locally: nothing else in this class needs these modules.
        import os
        import tempfile

        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")

        # Work on a private temporary file rather than a hard-coded name in
        # the current directory, and make sure it is removed afterwards even
        # if an assertion fails.
        fd, path = tempfile.mkstemp()
        os.close(fd)
        self.addCleanup(os.unlink, path)

        for encoding in tests:
            with codecs.open(path, 'w+', encoding=encoding) as f:
                # Check if the BOM is written only once
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                # Reading a second time after seeking back must give the
                # same text.
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1615
1616
def test_main():
    """Run the test case classes listed below via support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001646
1647
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()