# blob: bc29e06c4f5007b2562f534e9bc8a076f5449558 [file] [log] [blame]
import codecs
import io
import sys
import unittest

import _testcapi
from test import support

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer is only sliced and concatenated, so whatever sequence type
    is passed to the constructor (the tests use ``bytes``) is preserved.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the queue; otherwise at most ``size``
        # items are removed from the head and returned.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class MixInCheckStateHandling:
    # Mix-in for unittest.TestCase subclasses: verifies that the state of an
    # incremental codec can be extracted with getstate() and transplanted
    # into a fresh codec with setstate() at every possible split point.

    def check_state_handling_decode(self, encoding, u, s):
        # ``s`` is the encoded form of ``u``; split ``s`` at every offset.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # ``s`` is the encoded form of ``u``; split ``u`` at every offset.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # Continue in a fresh encoder that only knows the saved state.
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Shared stream-reader / incremental-decoder tests.  Every method reads
    # ``self.encoding``, which the concrete subclasses provide.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Returns every line read, joined with "|" so that the line
            # boundaries chosen by readline() are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: every one of them is
        # whitespace, so "\n \r\n \r \u2028".split() yields an empty list
        # and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Start at 200 characters: an empty first line would make
            # readalllines() stop early when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expected values must too.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must hand back exactly the lines
        # that were written, line ends included.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data arrives in the reader
        # piecemeal exactly as it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in either byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # No BOM here: a character appears once its fourth byte arrives.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # No BOM here: a character appears once its fourth byte arrives.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in either byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            # remove the scratch file even when an assertion fails
            support.unlink(support.TESTFN)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM here: a character appears once its second byte arrives.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM here: a character appears once its second byte arrives.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One expected entry per encoded byte; a character only shows up
        # once the last byte of its sequence has been fed in.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions; the
        # remaining assertions pin the output of each error handler.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))

class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One expected entry per encoded byte of "a+-b".
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte decoded with final=True must be rejected.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # readbuffer_encode() accepts an array as well as bytes.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Read ``bytestring`` through a utf-8-sig StreamReader with a range
        # of read sizes (None means an unsized read()) and check that the
        # pieces always add up to ``unistring``.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Same data without the leading BOM must decode identically.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # escape_decode() produces bytes, so the expected value has to be
        # b"": a str "" never compares equal to it.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

Marc-André Lemburg29273c82003-02-04 19:35:03 +0000739class RecodingTest(unittest.TestCase):
740 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000741 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000742 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000743 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000744 f2.close()
745 # Python used to crash on this at exit because of a refcount
746 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000747
# Sample strings from RFC 3492.  Every entry is a pair of
# (unicode text, expected punycode as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #     T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #     <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
]

# Import-time sanity check: print any entry that is not a 2-item pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000855
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over every pair in ``punycode_testcases``."""

    def test_encode(self):
        """Encoding the text gives the expected punycode, ignoring case."""
        for text, expected in punycode_testcases:
            produced = text.encode("punycode")
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our code produces
            # only lower case, and lower-casing just the expected value is
            # not enough because some input characters are upper case too.
            self.assertEqual(
                str(produced, "ascii").lower(),
                str(expected, "ascii").lower(),
            )

    def test_decode(self):
        """Decoding the punycode bytes gives back the original text."""
        for expected, encoded in punycode_testcases:
            self.assertEqual(expected, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(expected, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000874
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000875class UnicodeInternalTest(unittest.TestCase):
876 def test_bug1251300(self):
877 # Decoding with unicode_internal used to not correctly handle "code
878 # points" above 0x10ffff on UCS-4 builds.
879 if sys.maxunicode > 0xffff:
880 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000881 (b"\x00\x10\xff\xff", "\U0010ffff"),
882 (b"\x00\x00\x01\x01", "\U00000101"),
883 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000884 ]
885 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000886 b"\x7f\xff\xff\xff",
887 b"\x80\x00\x00\x00",
888 b"\x81\x00\x00\x00",
889 b"\x00",
890 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000891 ]
892 for internal, uni in ok:
893 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000894 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000895 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000896 for internal in not_ok:
897 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000898 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000899 self.assertRaises(UnicodeDecodeError, internal.decode,
900 "unicode_internal")
901
902 def test_decode_error_attributes(self):
903 if sys.maxunicode > 0xffff:
904 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000905 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000906 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000907 self.assertEqual("unicode_internal", ex.encoding)
908 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
909 self.assertEqual(4, ex.start)
910 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000911 else:
912 self.fail()
913
914 def test_decode_callback(self):
915 if sys.maxunicode > 0xffff:
916 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
917 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000918 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000919 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
920 "ascii"),
921 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000922 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000923
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000924 def test_encode_length(self):
925 # Issue 3739
926 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000927 self.assertEqual(encoder("a")[1], 1)
928 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000929
Ezio Melottib3aedd42010-11-20 19:04:17 +0000930 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000931
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes.  An expected
# value of None marks input that nameprep must reject; (None, None) marks a
# vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
]
1084
1085
class NameprepTest(unittest.TestCase):
    """Run ``encodings.idna.nameprep`` over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        """Check each vector: either the prepared text or a UnicodeError."""
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the vector's section number (3.<n>).
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001104
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes plain ASCII and "xn--" labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes plain ASCII and non-ASCII labels."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """read() on an already drained stream reader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Incremental decoding, byte by byte and in larger pieces."""
        # With and without a trailing dot, for an ASCII and a non-ASCII
        # name.  (The third case used to repeat the fourth, leaving the
        # non-ASCII name without trailing dot untested.)
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected
            )

        # Output appears only once the dot ending a label has been fed;
        # the last label is flushed by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Incremental encoding, character by character and in pieces."""
        # Same four shapes as in the decoding test; the third case used to
        # be a duplicate of the fourth.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001180
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the codecs module-level functions."""

    def test_decode(self):
        """codecs.decode(): explicit and default encoding, error cases."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit and default encoding, error cases."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() needs exactly one callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects missing arguments and unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bogus in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bogus)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001222
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 stream reader."""

    def setUp(self):
        # Two lines of UTF-8 text; only the first ends with a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the line ending."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001232
class EncodedFileTest(unittest.TestCase):
    """Recoding in both directions through ``codecs.EncodedFile``."""

    def test_basic(self):
        """Reads and writes are converted between the two encodings."""
        # Reading: the file holds UTF-8, the caller gets UTF-16-LE.
        utf8_file = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(utf8_file, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller hands in UTF-8, the file receives Latin-1.
        latin1_file = io.BytesIO()
        recoding_writer = codecs.EncodedFile(latin1_file, 'utf-8', 'latin1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(latin1_file.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001244
# Names of the text codecs exercised by the generic tests that loop over
# this list (see BasicUnicodeTest).
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module exposes
# mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1349
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings skipped by the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001362
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks that loop over every codec in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by the codec registry must match the name
            # used for the lookup, treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # check the stateless encoder/decoder functions
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    # write one character at a time and drain the queue
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    # feed the reader a single byte at a time
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # calling without any argument must be rejected
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                # an int is not acceptable decoder input
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            # calling without any argument must be rejected
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1495
class CharmapTest(unittest.TestCase):
    """Checks for codecs.charmap_decode() with a string as decoding map."""

    def test_decode_with_string_map(self):
        # every byte has an entry in the map
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": a byte past the end of the map yields U+FFFD
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # "replace": a byte mapped to U+FFFE yields U+FFFD as well
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": a byte past the end of the map is dropped
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        # "ignore": a byte mapped to U+FFFE is dropped as well
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # "ignore" with an empty map: nothing is produced, but the whole
        # input is reported as consumed
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1528
class WithStmtTest(unittest.TestCase):
    """Checks that the codecs stream wrappers work in a ``with`` statement."""

    def test_encodedfile(self):
        # UTF-8 bytes in the underlying stream, Latin-1 bytes out of read()
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # UTF-8 bytes in the underlying stream, decoded text out of read()
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001541
class TypesTest(unittest.TestCase):
    """Checks which argument types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001571
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip checks for the "surrogateescape" error handler."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1604
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001605
class BomTest(unittest.TestCase):
    """Checks when a byte order mark is written by files opened through
    codecs.open() in 'w+' mode."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every section below creates support.TESTFN; make sure it is
        # removed again once the test has finished.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001660
Victor Stinner3fed0872010-05-22 02:16:27 +00001661
# Bytes-to-bytes codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
# zlib and bz2 are optional modules; their codecs are only tested when the
# corresponding module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
1680
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the codecs in bytes_transform_encodings."""

    def test_basics(self):
        binput = bytes(range(256))
        ainput = bytearray(binput)
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o))
            self.assertEqual(i, binput)

            # transform interface
            boutput = binput.transform(encoding)
            aoutput = ainput.transform(encoding)
            self.assertEqual(boutput, aoutput)
            # the result type must follow the type of the input object
            self.assertIsInstance(boutput, bytes)
            self.assertIsInstance(aoutput, bytearray)
            bback = boutput.untransform(encoding)
            aback = aoutput.untransform(encoding)
            self.assertEqual(bback, aback)
            self.assertEqual(bback, binput)
            self.assertIsInstance(bback, bytes)
            self.assertIsInstance(aback, bytearray)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            sin = b"\x80".transform(encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # readline() is not checked for these two codecs
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            sin = b"\x80".transform(encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
1721
1722
def test_main():
    # Run every test case class of this module through
    # support.run_unittest().
    support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
Fred Drake2e2be372001-09-20 21:33:42 +00001752
1753
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()