blob: 3426a4dde9a9a98d2ff7f9fc768a65156466a227 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # New data is appended behind whatever has not been read yet.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the buffer completely; otherwise at most
        # ``size`` items are handed out and the rest stays queued.
        pending = self._buffer
        if size < 0:
            self._buffer = pending[:0]  # empty buffer of the same type
            return pending
        self._buffer = pending[size:]
        return pending[:size]
26
class MixInCheckStateHandling:
    # Helpers for TestCase classes: they check that an incremental codec can
    # be interrupted at every position of its input with getstate() and
    # resumed in a fresh codec object with setstate().

    def check_state_handling_decode(self, encoding, u, s):
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = make_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Continue with a brand new decoder that is primed with the
            # state extracted from the old one
            decoder = make_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = make_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            # Hand the state over to a fresh encoder and finish there
            encoder = make_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Codec-independent reader tests.  Every method takes the codec name from
    # self.encoding, which this class does not define itself.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the split points show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate("\n \r\n \r \u2028".split()):
            vw.append((i*200)*"\3042" + lineend)
            vwo.append((i*200)*"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in "\n \r\n \r \u2028".split():
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Compare the complete list of lines: checking only the lines the
        # reader happens to yield would let a reader that stops early (or
        # returns nothing at all) pass unnoticed.
        self.assertEqual(list(reader), s)

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data becomes readable as
        # soon as it has been written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports the
        # bytes that were actually written when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        payload = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(payload)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
406
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        payload = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(payload)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
446
447
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports the
        # bytes that were actually written when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup first so the file is removed even if one of
        # the following steps fails.
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000523
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for data, replaced in cases:
            # strict decoding of the final chunk has to fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
562
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for data, replaced in cases:
            # strict decoding of the final chunk has to fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
601
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\x00\xff\u07ff\u0800\uffff"
        expected = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # Result of encoding a lone surrogate with each error handler
        handler_results = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, encoded in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), encoded)

    def test_surrogatepass_handler(self):
        text = "abc\ud800def"
        encoded = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
        self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        # The handler must also be available through the error registry
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000648
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "a+-b"
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000663
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte decoded as final input with strict error handling
        # has to raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
671
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # An array of signed bytes is accepted as input
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an integer argument is accepted
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
687
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # One entry per encoded byte: each entry is the text expected to be
        # available after that byte has been fed in.
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = (
            # The first BOM is read and skipped (3 bytes); the second BOM is
            # only emitted once its third byte has arrived.
            [""] * 5
            + ["\ufeff"]
            + ["\ufeff\x00"] * 2
            + ["\ufeff\x00\xff"] * 2
            + ["\ufeff\x00\xff\u07ff"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        text = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_chunked_read(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader for a range of
        # read sizes (None means read() without an argument) and check that
        # the collected output always equals unistring.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = make_reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    chunk = istream.read()
                else:
                    chunk = istream.read(sizehint)
                if not chunk:
                    break
                ostream.write(chunk)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM
        self._check_chunked_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # Same text without a leading BOM
        self._check_chunked_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )
767
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # escape_decode() returns a (bytes, consumed_length) pair, so the
        # expected value must use bytes as well: on Python 3 b"" != "",
        # which means a comparison against ("", 0) can never succeed.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000771
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c.  Nothing is asserted: the sequence below
        # only has to be executed.
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000780
Martin v. Löwis2548c732003-04-18 10:39:54 +0000781# From RFC 3492
782punycode_testcases = [
783 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000784 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
785 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000786 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000787 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000788 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000789 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000790 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000791 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000792 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000793 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000794 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
795 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
796 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000797 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000798 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000799 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
800 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
801 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000802 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000803 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000804 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +0000805 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
806 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
807 "\u0939\u0948\u0902",
808 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000809
810 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000811 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +0000812 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
813 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000814
815 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000816 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
817 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
818 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000819 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
820 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000821
822 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000823 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
824 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
825 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
826 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000827 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000828
829 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000830 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
831 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
832 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
833 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
834 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000835 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000836
837 # (K) Vietnamese:
838 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
839 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000840 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
841 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
842 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
843 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000844 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000845
Martin v. Löwis2548c732003-04-18 10:39:54 +0000846 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000847 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000848 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +0000849
Martin v. Löwis2548c732003-04-18 10:39:54 +0000850 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000851 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
852 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
853 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000854 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000855
856 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000857 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
858 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
859 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000860 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000861
862 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000863 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000864 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000865
866 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000867 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
868 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000869 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000870
871 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000872 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000873 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000874
875 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000876 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000877 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000878
879 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000880 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
881 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +0000882 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000883 ]
884
# Import-time sanity check of the table above: any entry that is not a
# 2-item tuple is printed (nothing is raised).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000888
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii")
            expected = puny.decode("ascii")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode a second time from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000907
class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            return
        # The byte strings below are written big-endian and are reversed
        # before use on little-endian machines.
        swap = sys.byteorder == "little"
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if swap:
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if swap:
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode <= 0xffff:
            return
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            # The error must identify the codec, the input object and the
            # offending byte range.
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    def test_decode_callback(self):
        if sys.maxunicode <= 0xffff:
            return
        # Register codecs.ignore_errors under a private name and decode
        # "ab" with four extra \x22 bytes inserted after the first 4 bytes.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), "ascii")
        ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        for text, expected_length in (("a", 1), ("\xe9\u0142", 2)):
            self.assertEqual(encoder(text)[1], expected_length)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000964
Martin v. Löwis2548c732003-04-18 10:39:54 +0000965# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
966nameprep_tests = [
967 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000968 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
969 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
970 b'\xb8\x8f\xef\xbb\xbf',
971 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000972 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000973 (b'CAFE',
974 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000975 # 3.3 Case folding 8bit U+00DF (german sharp s).
976 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000977 (b'\xc3\x9f',
978 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000979 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000980 (b'\xc4\xb0',
981 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000982 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000983 (b'\xc5\x83\xcd\xba',
984 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000985 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
986 # XXX: skip this as it fails in UCS-2 mode
987 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
988 # 'telc\xe2\x88\x95kg\xcf\x83'),
989 (None, None),
990 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000991 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
992 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000993 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000994 (b'\xe1\xbe\xb7',
995 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000996 # 3.9 Self-reverting case folding U+01F0 and normalization.
997 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000998 (b'\xc7\xb0',
999 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001000 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001001 (b'\xce\x90',
1002 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001003 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001004 (b'\xce\xb0',
1005 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001006 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001007 (b'\xe1\xba\x96',
1008 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001009 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001010 (b'\xe1\xbd\x96',
1011 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001012 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001013 (b' ',
1014 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001015 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001016 (b'\xc2\xa0',
1017 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001018 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001019 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001020 None),
1021 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001022 (b'\xe2\x80\x80',
1023 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001024 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001025 (b'\xe2\x80\x8b',
1026 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001027 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001028 (b'\xe3\x80\x80',
1029 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001030 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001031 (b'\x10\x7f',
1032 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001033 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001034 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001035 None),
1036 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001037 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001038 None),
1039 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001040 (b'\xef\xbb\xbf',
1041 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001042 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001043 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001044 None),
1045 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001046 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001047 None),
1048 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001049 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001050 None),
1051 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001052 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001053 None),
1054 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001055 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001056 None),
1057 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001058 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001059 None),
1060 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001061 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001062 None),
1063 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001064 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001065 None),
1066 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001067 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001068 None),
1069 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001070 (b'\xcd\x81',
1071 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001072 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001073 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001074 None),
1075 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001076 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001077 None),
1078 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001079 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001080 None),
1081 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001082 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001083 None),
1084 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001085 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001086 None),
1087 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001088 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001089 None),
1090 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001091 (b'foo\xef\xb9\xb6bar',
1092 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001093 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001094 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001095 None),
1096 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001097 (b'\xd8\xa71\xd8\xa8',
1098 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001099 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001100 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001101 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001102 # None),
1103 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001104 # 3.44 Larger test (shrinking).
1105 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001106 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1107 b'\xaa\xce\xb0\xe2\x80\x80',
1108 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001109 # 3.45 Larger test (expanding).
1110 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001111 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1112 b'\x80',
1113 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1114 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1115 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001116 ]
1117
1118
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        # Entries are numbered from 1 to match the "3.N" labels used in the
        # failure message.
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            source = orig.decode("utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            expected = prepped.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(source), expected)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001137
class IDNACodecTest(unittest.TestCase):
    # (encoded bytes, decoded text) pairs: an ASCII-only and a non-ASCII
    # name, each with and without a trailing dot.
    _name_pairs = [
        (b"python.org", "python.org"),
        (b"python.org.", "python.org."),
        (b"xn--pythn-mua.org", "pyth\xf6n.org"),
        (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
    ]

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the 3 available characters have been read, a further read()
        # must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed every name to iterdecode() one byte at a time.  The previous
        # version asserted the trailing-dot non-ASCII name twice and never
        # checked the variant without the dot; the table covers both.
        for encoded, expected in self._name_pairs:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected
            )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same table in the other direction.  The previous version repeated
        # the trailing-dot non-ASCII name and skipped the one without it.
        for expected, text in self._name_pairs:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot nothing is left for the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001213
class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # Called without an explicit encoding.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # Called without an explicit encoding.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_locale = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE setting however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1268
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # UTF-8 encoded input: U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() keeps the line terminator on the first line.
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001278
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, read() hands back
        # UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written to the wrapper arrive in the
        # underlying file as Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001290
# Names of the text codecs that BasicUnicodeTest iterates over.  Each entry
# is passed to codecs.lookup()/getencoder()/getdecoder() as-is.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module exposes
# mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode.  BasicUnicodeTest
# skips its stream reader/writer checks (test_basics, test_seek) for them.
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Superset of the list above: encodings for which BasicUnicodeTest also
# skips the incremental encoder/decoder checks (test_basics,
# test_decoder_state).
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001408
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in ``all_unicode_encodings``.

    Codecs listed in ``broken_unicode_with_streams`` are excluded from the
    stream reader/writer checks, and those in ``broken_incremental_coders``
    from the incremental encoder/decoder checks.
    """

    def test_basics(self):
        """Round-trip a short ASCII string through each codec interface."""
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must agree with the list entry,
            # ignoring "-" vs "_" and the two special cases below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encoder/decoder round trip.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_roundtrip(encoding, s)
                # NOTE(review): this guard is assumed to be nested inside the
                # broken_incremental_coders check -- confirm against the
                # original indentation.
                if encoding not in ("idna", "mbcs"):
                    self._check_incremental_roundtrip_ignore(encoding, s)

    def _check_stream_roundtrip(self, encoding, s):
        """Round-trip *s* one item at a time through StreamWriter/StreamReader."""
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            # The writer must hand bytes (not a subclass or str) to the stream.
            self.assertIs(type(chunk), bytes,
                          "%r is not bytes (encoding=%r)" % (type(chunk), encoding))
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            # Feed a single byte, then drain whatever the reader can decode.
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def _check_incremental_roundtrip(self, encoding, s):
        """Round-trip *s* via incremental coders (Python and C API) and iterencode()/iterdecode()."""
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError: # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        encodedresult = b"".join(encoder.encode(c) for c in s)
        encodedresult += encoder.encode("", True)
        decoder = codecs.getincrementaldecoder(encoding)()
        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
        decodedresult += decoder.decode(b"", True)
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

        # check C API
        encodedresult = b"".join(cencoder.encode(c) for c in s)
        encodedresult += cencoder.encode("", True)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding)
        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
        decodedresult += cdecoder.decode(b"", True)
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

        # check iterencode()/iterdecode()
        result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
        self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

        # check iterencode()/iterdecode() with empty string
        result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
        self.assertEqual(result, "")

    def _check_incremental_roundtrip_ignore(self, encoding, s):
        """Round-trip *s* via incremental coders created with errors="ignore"."""
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError: # no IncrementalEncoder
            return

        encodedresult = b"".join(encoder.encode(c) for c in s)
        decoder = codecs.getincrementaldecoder(encoding)("ignore")
        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

        encodedresult = b"".join(cencoder.encode(c) for c in s)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading the whole text."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) a non-bytes one."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1541
class CharmapTest(unittest.TestCase):
    """charmap_decode() with a str used as the decoding table."""

    def test_decode_with_string_map(self):
        raw = b"\x00\x01\x02"
        # (error handler, decoding table, expected result)
        cases = (
            ("strict", "abc", ("abc", 3)),
            ("replace", "ab", ("ab\ufffd", 3)),
            ("replace", "ab\ufffe", ("ab\ufffd", 3)),
            ("ignore", "ab", ("ab", 3)),
            ("ignore", "ab\ufffe", ("ab", 3)),
        )
        for errors, table, expected in cases:
            self.assertEqual(codecs.charmap_decode(raw, errors, table), expected)

        # With an empty table and "ignore", every byte is consumed and
        # nothing is produced.
        allbytes = bytes(range(256))
        decoded = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(decoded, ("", len(allbytes)))
1574
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            self.assertEqual(recoder.read(), b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001587
class TypesTest(unittest.TestCase):
    """Which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders += [codecs.mbcs_decode]
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001617
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1650
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001651
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() files across seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        twice = data * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Each scenario starts from a freshly truncated read/write file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as fh:
                fh.write(data)
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), twice)
                fh.seek(0)
                self.assertEqual(fh.read(), twice)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as fh:
                fh.write(data[0])
                self.assertNotEqual(fh.tell(), 0)
                fh.seek(0)
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as fh:
                out = fh.writer
                out.write(data[0])
                self.assertNotEqual(out.tell(), 0)
                out.seek(0)
                out.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as fh:
                fh.write(data)
                fh.seek(fh.tell())
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), twice)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as fh:
                out = fh.writer
                out.write(data)
                out.seek(out.tell())
                out.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), twice)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001707
Victor Stinner3fed0872010-05-22 02:16:27 +00001708
# Bytes-to-bytes codecs iterated over by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
# The zlib and bz2 codecs are only added when their modules can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
1727
class TransformCodecTest(unittest.TestCase):
    """Round trips through the codecs in ``bytes_transform_encodings``."""

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)
            encoded, consumed = encode(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # These two codecs are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
1755
1756
def test_main():
    """Run the listed test case classes through support.run_unittest()."""
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_cases)


if __name__ == "__main__":
    test_main()