blob: e412a6444e2b3cf2716363aba71c8477eb0e6ab1 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end.

    Works with any sliceable buffer type that supports ``+=`` (the tests
    use ``bytes``).
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the buffer.

        A negative *size* (the default) drains the whole buffer.
        """
        if size >= 0:
            head = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return head
        everything = self._buffer
        # slicing keeps the buffer type while making it empty
        self._buffer = everything[:0]
        return everything
25
class MixInCheckStateHandling:
    """Helpers checking that incremental codec state survives a transplant.

    Uses the assert* methods of unittest.TestCase, so it has to be mixed
    into a TestCase subclass.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Decode the bytes *s* split at every possible offset.

        The state of the decoder that consumed the first part is copied into
        a brand new decoder which decodes the rest; both parts together must
        equal *u*.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = make_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertTrue(isinstance(state[1], int))
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = make_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode the text *u* split at every possible offset.

        The state of the encoder that consumed the first part is copied into
        a brand new encoder which encodes the rest; both parts together must
        equal *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = make_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = make_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Stream and incremental decoding tests shared by the codec test classes.

    Subclasses provide an ``encoding`` class attribute naming the codec
    under test.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and check the partial output.

        After each byte the text decoded so far must equal the corresponding
        entry of *partialresults*.  The check is done with a StreamReader,
        with an incremental decoder, with the same decoder after reset(),
        and finally the whole text is round-tripped through iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must split on every supported line ending."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Returns all lines joined with "|"; stops at the first empty
            # result, so inputs must not contain lines that read back as "".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line endings are listed explicitly: every one of them is
        # whitespace, so building the list with str.split() yields [] and
        # the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty, otherwise readalllines()
            # would stop immediately when keepends is false
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a reader over CRLF-terminated lines returns them intact."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is still being written to."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three CRLF lines of different lengths are read back one by one."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five short CRLF lines are read back one by one."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec, which writes and expects a BOM."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in both byte orders
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter emit exactly one BOM."""
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(info.streamreader(io.BytesIO(data)).read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM is rejected."""
        make_reader = codecs.getreader(self.encoding)
        for count in (4, 8):
            f = make_reader(io.BytesIO(count*b"\xff"))
            with self.assertRaises(UnicodeError):
                f.read()

    def test_partial(self):
        """Characters appear only once all four of their bytes were fed."""
        expected = [""] * 4                   # BOM bytes; byteorder known after the 4th
        expected += [""] * 3                  # first three bytes of "\x00"
        expected += ["\x00"] * 4
        expected += ["\x00\xff"] * 4
        expected += ["\x00\xff\u0100"] * 4
        expected += ["\x00\xff\u0100\uffff"]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_handlers(self):
        """Truncated input honours the 'replace' and 'ignore' handlers."""
        for handler, text in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((text, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state can be transplanted for both byte orders."""
        for data in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding, "spamspam", data)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    """Tests for the BOM-less little-endian UTF-32 codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        """Characters appear only once all four of their bytes were fed."""
        expected = [""] * 3
        expected += ["\x00"] * 4
        expected += ["\x00\xff"] * 4
        expected += ["\x00\xff\u0100"] * 4
        expected += ["\x00\xff\u0100\uffff"]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_simple(self):
        """A non-BMP character is encoded least significant byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
406
class UTF32BETest(ReadTest):
    """Tests for the BOM-less big-endian UTF-32 codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        """Characters appear only once all four of their bytes were fed."""
        expected = [""] * 3
        expected += ["\x00"] * 4
        expected += ["\x00\xff"] * 4
        expected += ["\x00\xff\u0100"] * 4
        expected += ["\x00\xff\u0100\uffff"]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_simple(self):
        """A non-BMP character is encoded most significant byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
446
447
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec, which writes and expects a BOM."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in both byte orders
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter emit exactly one BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM is rejected."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Characters appear only once both of their bytes were fed."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """Truncated input honours the 'replace' and 'ignore' handlers."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state can be transplanted for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the universal-newlines flag 'U':
        # 'U' is deprecated and rejected by newer versions of open(), and
        # the point of the test is that no newline translation happens.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunae36b2c62010-02-27 11:38:27 +0000523
class UTF16LETest(ReadTest):
    """Tests for the BOM-less little-endian UTF-16 codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Characters appear only once both of their bytes were fed."""
        expected = [""]
        expected += ["\x00"] * 2
        expected += ["\x00\xff"] * 2
        expected += ["\x00\xff\u0100"] * 2
        expected += ["\x00\xff\u0100\uffff"]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000545
class UTF16BETest(ReadTest):
    """Tests for the BOM-less big-endian UTF-16 codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Characters appear only once both of their bytes were fed."""
        expected = [""]
        expected += ["\x00"] * 2
        expected += ["\x00\xff"] * 2
        expected += ["\x00\xff\u0100"] * 2
        expected += ["\x00\xff\u0100\uffff"]
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000567
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        """A character appears once the last byte of its sequence was fed."""
        expected = ["\x00"]                          # one-byte sequence
        expected += ["\x00"]                         # first byte of "\xff"
        expected += ["\x00\xff"]                     # second byte of "\xff"
        expected += ["\x00\xff"]                     # first byte of "\u07ff"
        expected += ["\x00\xff\u07ff"] * 3
        expected += ["\x00\xff\u07ff\u0800"] * 3
        expected += ["\x00\xff\u07ff\u0800\uffff"]
        self.check_partial("\x00\xff\u07ff\u0800\uffff", expected)

    def test_decoder_state(self):
        """Decoder state can be transplanted at every byte offset."""
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler copes."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        """'surrogatepass' lets a lone surrogate through in both directions."""
        text = "abc\ud800def"
        data = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), data)
        self.assertEqual(data.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000614
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Byte-wise decoding of text that contains an escaped '+'."""
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial("a+-b", expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000629
class UTF16ExTest(unittest.TestCase):
    """Argument and error handling of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """Truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling without arguments is a TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
637
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of bytes comes back as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an integer argument is a TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
653
# codecs.charbuffer_encode is a private helper that newer interpreters no
# longer provide; skip instead of erroring out with AttributeError there.
@unittest.skipUnless(hasattr(codecs, "charbuffer_encode"),
                     "codecs.charbuffer_encode is not available")
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode()."""

    def test_string(self):
        """A bytes object comes back unchanged together with its length."""
        self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))

    def test_empty(self):
        """Empty bytes yield empty bytes and a length of zero."""
        self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))

    def test_bad_args(self):
        """A missing argument or an integer argument is a TypeError."""
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
665
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional signature)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Only the first BOM is skipped; later output arrives per character."""
        expected = [""] * 3                       # First BOM has been read and skipped
        expected += [""] * 2
        expected += ["\ufeff"]                    # Second BOM has been read and emitted
        expected += ["\ufeff\x00"]                # "\x00" read and emitted
        expected += ["\ufeff\x00"]                # First byte of encoded "\xff" read
        expected += ["\ufeff\x00\xff"]            # Second byte of encoded "\xff" read
        expected += ["\ufeff\x00\xff"]            # First byte of encoded "\u07ff" read
        expected += ["\ufeff\x00\xff\u07ff"]      # Second byte of encoded "\u07ff" read
        expected += ["\ufeff\x00\xff\u07ff"] * 2
        expected += ["\ufeff\x00\xff\u07ff\u0800"] * 3
        expected += ["\ufeff\x00\xff\u07ff\u0800\uffff"]
        self.check_partial("\ufeff\x00\xff\u07ff\u0800\uffff", expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        """The incremental decoder drops the signature the encoder wrote."""
        text = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_chunked_reads(self, bytestring, unistring):
        """Read *bytestring* with a range of size hints and compare the
        concatenated chunks with *unistring*."""
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Stream reading of data that starts with the UTF-8 signature."""
        self._check_chunked_reads(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        """Stream reading of data without the UTF-8 signature."""
        self._check_chunked_reads(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
745
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # escape_decode() is a bytes-to-bytes codec: feed it bytes and
        # expect bytes back.  Comparing its result with a tuple holding a
        # str can never succeed, because b"" != "" on Python 3.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000749
class RecodingTest(unittest.TestCase):
    # Regression test without assertions: it passes as long as the
    # write/close sequence below neither raises nor (historically) brings
    # the interpreter down at exit.
    def test_recoding(self):
        f = io.BytesIO()
        # Wrap the in-memory byte stream in a recoding file object.
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000758
# From RFC 3492
# Each entry is a pair (text, punycode): the Unicode string and the ASCII
# bytes of its Punycode form.  The letter in each comment is the label of the
# corresponding sample string.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print any entry that is not a
# 2-tuple (for example because a comma is missing between two entries).
# This only reports the entry; it does not abort the test run.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000866
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our encoder only
            # produces lower case, and lowering just the expected value
            # would not be enough because some input characters are upper
            # case themselves.
            produced = str(text.encode("punycode"), "ascii")
            reference = str(expected, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000885
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000886class UnicodeInternalTest(unittest.TestCase):
887 def test_bug1251300(self):
888 # Decoding with unicode_internal used to not correctly handle "code
889 # points" above 0x10ffff on UCS-4 builds.
890 if sys.maxunicode > 0xffff:
891 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000892 (b"\x00\x10\xff\xff", "\U0010ffff"),
893 (b"\x00\x00\x01\x01", "\U00000101"),
894 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000895 ]
896 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000897 b"\x7f\xff\xff\xff",
898 b"\x80\x00\x00\x00",
899 b"\x81\x00\x00\x00",
900 b"\x00",
901 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000902 ]
903 for internal, uni in ok:
904 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000905 internal = bytes(reversed(internal))
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000906 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000907 for internal in not_ok:
908 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000909 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000910 self.assertRaises(UnicodeDecodeError, internal.decode,
911 "unicode_internal")
912
913 def test_decode_error_attributes(self):
914 if sys.maxunicode > 0xffff:
915 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000916 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000917 except UnicodeDecodeError as ex:
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000918 self.assertEqual("unicode_internal", ex.encoding)
919 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
920 self.assertEqual(4, ex.start)
921 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000922 else:
923 self.fail()
924
925 def test_decode_callback(self):
926 if sys.maxunicode > 0xffff:
927 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
928 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000929 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000930 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
931 "ascii"),
932 "UnicodeInternalTest")
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000933 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000934
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000935 def test_encode_length(self):
936 # Issue 3739
937 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000938 self.assertEqual(encoder("a")[1], 1)
939 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000940
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000941 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenveyddf0d032010-06-09 17:56:11 +0000942
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected) of UTF-8 encoded byte strings:
#   * expected is None when nameprep must reject the input;
#   * (None, None) marks a vector that is deliberately skipped.
# The position of an entry is significant: entry number N (counting from 1)
# is test vector "3.N", so skipped vectors keep a placeholder instead of
# being removed.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1095
1096
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep

        def as_text(raw):
            # The Unicode strings are given in UTF-8; "surrogatepass" lets
            # the vectors containing encoded surrogates be decoded as well.
            return str(raw, "utf-8", "surrogatepass")

        # Count from 1 so that `number` is the N of test vector "3.N".
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            source = as_text(orig)
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            expected = as_text(prepped)
            try:
                self.assertEqual(nameprep(source), expected)
            except Exception as e:
                # Re-raise with the vector number so the failing entry can
                # be identified.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001115
class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        # str(bytes, "idna") turns "xn--" labels back into Unicode and keeps
        # a trailing dot.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(str(encoded, "idna"), expected)

    def test_builtin_encode(self):
        # str.encode("idna") is the inverse of the decoding checked above.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(text.encode("idna"), expected)

    def test_stream(self):
        # Once the whole input has been read, a further read() must return
        # an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().  Each case
        # is checked once; an earlier revision repeated the last one
        # verbatim, which added no coverage.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected
            )

        # A label is only emitted once its terminating dot has been seen;
        # the unterminated last label is held back until final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # After reset() the same decoder starts from scratch; with a trailing
        # dot nothing is left over for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() is fed the string character by character.  Each case
        # is checked once; an earlier revision repeated the last one
        # verbatim, which added no coverage.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # As with decoding, a label is only emitted once its dot has been
        # seen; the last label appears when final=True is passed.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001191
class CodecsModuleTest(unittest.TestCase):

    def _check_bad_lookups(self, getter):
        # Checks shared by the codecs.get*() accessors: calling without an
        # encoding name is a TypeError, an unknown name a LookupError.
        with self.assertRaises(TypeError):
            getter()
        with self.assertRaises(LookupError):
            getter("__spam__")

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        # register() needs exactly one argument and it must be callable.
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        self._check_bad_lookups(codecs.lookup)
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        self._check_bad_lookups(codecs.getencoder)

    def test_getdecoder(self):
        self._check_bad_lookups(codecs.getdecoder)

    def test_getreader(self):
        self._check_bad_lookups(codecs.getreader)

    def test_getwriter(self):
        self._check_bad_lookups(codecs.getwriter)
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001233
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # UTF-8 for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split after the newline and keep it on the
        # first line.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001243
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the UTF-8 bytes stored in the underlying file come back
        # as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed to the wrapper end up as Latin-1 in
        # the underlying file.
        sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001255
# Names of the text encodings exercised by the generic codec tests.
# Entries are kept in their original order, grouped by name prefix.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    # cp*
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    # euc_* / gb*
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    # iso2022_*
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # iso8859_*
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    # mac_*
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    # shift_jis*
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    # utf_*
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1358
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001359# The following encoding is not tested, because it's not supposed
1360# to work:
1361# "undefined"
1362
# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental coder checks: everything that is
# broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]

# The following encodings only support "strict" mode
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001376
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    @staticmethod
    def _encode_stepwise(encoder, text, flush):
        """Feed *text* to an incremental encoder one character at a time.

        With *flush* true, a closing ``encode("", True)`` call is made and
        its output appended. Returns the concatenated bytes.
        """
        pieces = [encoder.encode(char) for char in text]
        if flush:
            pieces.append(encoder.encode("", True))
        return b"".join(pieces)

    @staticmethod
    def _decode_stepwise(decoder, data, flush):
        """Feed *data* to an incremental decoder one byte at a time.

        With *flush* true, a closing ``decode(b"", True)`` call is made and
        its output appended. Returns the concatenated text.
        """
        pieces = [decoder.decode(bytes([byte])) for byte in data]
        if flush:
            pieces.append(decoder.decode(b"", True))
        return "".join(pieces)

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        mismatch = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The name reported by the codec registry has to agree with
            # the list entry, ignoring "_" versus "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encoder/decoder round trip
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             mismatch % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, mismatch % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: write character by character,
                # then read back byte by byte
                queue = Queue(b"")
                writer = codecs.getwriter(encoding)(queue)
                chunks = []
                for char in text:
                    writer.write(char)
                    chunk = queue.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    chunks.append(chunk)
                encoded = b"".join(chunks)

                queue = Queue(b"")
                reader = codecs.getreader(encoding)(queue)
                decoded = ""
                for byte in encoded:
                    queue.write(bytes([byte]))
                    decoded += reader.read()
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # Python API
                encoded = self._encode_stepwise(encoder, text, True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = self._decode_stepwise(decoder, encoded, True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # C API
                encoded = self._encode_stepwise(cencoder, text, True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decoded = self._decode_stepwise(cdecoder, encoded, True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # iterencode()/iterdecode()
                encoded_iter = codecs.iterencode(text, encoding)
                result = "".join(codecs.iterdecode(encoded_iter, encoding))
                self.assertEqual(result, text,
                                 mismatch % (result, text, encoding))

                # iterencode()/iterdecode() with empty string
                encoded_iter = codecs.iterencode("", encoding)
                result = "".join(codecs.iterdecode(encoded_iter, encoding))
                self.assertEqual(result, "")

            if encoding in only_strict_mode:
                continue

            # check incremental decoder/encoder with errors argument
            # (no closing flush call is made here)
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encoded = self._encode_stepwise(encoder, text, False)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decoded = self._decode_stepwise(decoder, encoded, False)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                encoded = self._encode_stepwise(cencoder, text, False)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decoded = self._decode_stepwise(cdecoder, encoded, False)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped too -- FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(text.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # calling without any argument must be rejected
            self.assertRaises(TypeError, decoder)
            if encoding in ("idna", "punycode"):
                # the integer-argument check is not applied to these two
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            encoded = text.encode(encoding)
            self.check_state_handling_decode(encoding, text, encoded)
            self.check_state_handling_encode(encoding, text, encoded)
1509
class CharmapTest(unittest.TestCase):
    """Check codecs.charmap_decode() when the mapping is a string."""

    def test_decode_with_string_map(self):
        three_bytes = b"\x00\x01\x02"
        # (error handler, mapping string, expected decoded text); every
        # case is expected to report all three input bytes as consumed.
        cases = (
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, mapping, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(three_bytes, errors, mapping),
                (expected, 3)
            )

        # With an empty mapping and "ignore", every byte is dropped but
        # still counted as consumed.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1542
class WithStmtTest(unittest.TestCase):
    """Check that the codecs file wrappers work as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            # UTF-8 on disk, Latin-1 handed out to the reader
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001555
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoder_names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        # mbcs_decode is only checked when the codecs module provides it
        if hasattr(codecs, "mbcs_decode"):
            decoder_names.append("mbcs_decode")
        for decoder_name in decoder_names:
            self.assertRaises(TypeError, getattr(codecs, decoder_name), "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), expected)
            self.assertEqual(decode(br"\u1234"), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001585
class SurrogateEscapeTest(unittest.TestCase):
    """Check the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, raw, text, encoding):
        # Decoding *raw* must give *text*, and encoding *text* must give
        # back *raw*, both using the "surrogateescape" handler.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "utf-8")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip(b"\xed\xb0\x80", "\udced\udcb0\udc80", "utf-8")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "ascii")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip(b"foo\xa5bar", "foo\udca5bar", "iso-8859-3")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1618
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001619
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def _check_bom_written_once(self, encoding, data):
        # Check if the BOM is written only once
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            f.write(data)
            f.write(data)
            for _ in range(2):
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

    def _check_bom_after_rewind(self, encoding, data, via_writer):
        # Check that the BOM is written after a seek(0). With *via_writer*
        # true, the write/tell/seek calls go through f.writer (the
        # StreamWriter) instead of the file object itself.
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            out = f.writer if via_writer else f
            out.write(data[0])
            self.assertNotEqual(out.tell(), 0)
            out.seek(0)
            out.write(data)
            f.seek(0)
            self.assertEqual(f.read(), data)

    def _check_no_bom_after_other_seek(self, encoding, data, via_writer):
        # Check that the BOM is not written after a seek() at a position
        # different than the start. *via_writer* has the same meaning as
        # in _check_bom_after_rewind().
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            out = f.writer if via_writer else f
            out.write(data)
            out.seek(out.tell())
            out.write(data)
            f.seek(0)
            self.assertEqual(f.read(), data * 2)

    def test_seek0(self):
        data = "1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        # The scratch file is shared by all scenarios and removed at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            self._check_bom_written_once(encoding, data)
            self._check_bom_after_rewind(encoding, data, False)
            self._check_bom_after_rewind(encoding, data, True)
            self._check_no_bom_after_other_seek(encoding, data, False)
            self._check_no_bom_after_other_seek(encoding, data, True)
Victor Stinnerb64d0eb2010-05-22 17:01:13 +00001675
Victor Stinner37b82002010-05-22 02:17:42 +00001676
def test_main():
    """Run every test case class of this module through test.support."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()