blob: 8287a5b4ec536290c9c2c23f36b141fa8288828a [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    Minimal FIFO buffer: data is appended at one end with write() and
    consumed, in order, from the other end with read().
    """
    def __init__(self, buffer):
        # The initial content; everything read() returns is a slice of it
        # (plus whatever write() appended), so it has the same type.
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most
        # ``size`` items are removed from the head and returned.
        pending = self._buffer
        if size < 0:
            self._buffer = pending[:0] # make empty
            return pending
        head = pending[:size]
        self._buffer = pending[size:]
        return head
25
class MixInCheckStateHandling:
    # Helpers for TestCase classes: verify that the state of an incremental
    # codec can be saved with getstate() and restored into a fresh codec
    # with setstate() at every possible split point of the input.

    def check_state_handling_decode(self, encoding, u, s):
        # ``s`` is the encoded form (bytes) and ``u`` the expected decoded
        # text.  The input is split at every offset; the second half is
        # decoded by a new decoder primed with the first decoder's state.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # ``u`` is the text to encode and ``s`` the expected encoded bytes.
        # The text is split at every offset; the second half is encoded by
        # a new encoder primed with the first encoder's state.
        for i in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Reader tests shared by the per-encoding test classes.  Every method
    # reads ``self.encoding``, which the concrete subclasses define.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line from a fresh reader and join them with "|"
            # so that the line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be spelled out: every one of them is
        # whitespace, so building this with str.split() yields an empty
        # list and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 so that even the first line is long and non-empty
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so must the expectation
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over the reader must yield exactly the original lines.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000278
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in little-endian and
    # big-endian byte order respectively.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped by the respective
        # error handler instead of raising.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
365
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once all four of its bytes have been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte is a truncated code unit when ``final`` is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
405
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once all four of its bytes have been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte is a truncated code unit when ``final`` is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
445
446
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in little-endian and
    # big-endian byte order respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped by the respective
        # error handler instead of raising.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            # Plain 'r' rather than the legacy 'U' flag: builtin open()
            # rejects 'U' from Python 3.11 on, and codecs.open() adds the
            # binary flag itself whenever an encoding is given.
            with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            support.unlink(support.TESTFN)
524
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once both of its bytes have been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # A single byte is a truncated code unit when ``final`` is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b'\x00\xd8\x03\xde')
        decoded = b'\x00\xd8\x03\xde'.decode(self.encoding)
        self.assertEqual(decoded, "\U00010203")
552
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once both of its bytes have been fed in.
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # A single byte is a truncated code unit when ``final`` is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b'\xd8\x00\xde\x03')
        decoded = b'\xd8\x00\xde\x03'.decode(self.encoding)
        self.assertEqual(decoded, "\U00010203")
580
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once its last byte has been fed in.
        text = "\x00\xff\u07ff\u0800\uffff"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions ...
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # ... while each error handler produces its own replacement.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        encoded = "abc\ud800def".encode("utf-8", "surrogatepass")
        self.assertEqual(encoded, b"abc\xed\xa0\x80def")
        decoded = b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass")
        self.assertEqual(decoded, "abc\ud800def")
        handler = codecs.lookup_error("surrogatepass")
        self.assertTrue(handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000627
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per encoded byte of the input text.
        text = "a+-b"
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000642
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte is a truncated code unit when ``final`` is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the mandatory data argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
650
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # An array of signed bytes is accepted as input and its raw
        # content comes back as bytes together with the length consumed.
        source = array.array("b", b"spam")
        encoded = codecs.readbuffer_encode(source)
        self.assertEqual(encoded, (b"spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are rejected.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
666
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The input starts with a BOM character of its own, so the encoded
        # form carries two BOMs: the signature and the encoded character.
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def test_stream_bom(self):
        # Input with a leading signature: it must be stripped whatever
        # chunk size the stream is read with.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            # No hint means a plain read() without arguments.
            read_args = () if sizehint is None else (sizehint,)
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bare(self):
        # Input without a signature must decode just the same.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            # No hint means a plain read() without arguments.
            read_args = () if sizehint is None else (sizehint,)
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)
746
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty bytes with nothing consumed."""
        # escape_decode() returns a bytes object, so the expected value must
        # be b"": the tuple ("", 0) never compares equal to (b"", 0).
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000750
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000751class RecodingTest(unittest.TestCase):
752 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000753 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000754 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000755 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000756 f2.close()
757 # Python used to crash on this at exit because of a refcount
758 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000759
# From RFC 3492
#
# Each entry is a 2-tuple: the text as a str and its punycode form as
# bytes.  PunycodeTest iterates over these pairs in both directions.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check on the table above: print any entry that is not
# a 2-tuple (for example after a misplaced comma merged or split strings).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000867
class PunycodeTest(unittest.TestCase):
    """Check the punycode codec against the punycode_testcases vectors."""

    def test_encode(self):
        """Encoding each text gives the expected bytes, ignoring case."""
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case, but our code produces
            # only lower case.  Lower-casing just the expected value is
            # also insufficient, since some of the input characters are
            # upper case.
            produced = str(text.encode("punycode"), "ascii")
            wanted = str(expected, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding each vector gives back the original text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Repeat the check on a copy rebuilt through an ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000886
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000887class UnicodeInternalTest(unittest.TestCase):
888 def test_bug1251300(self):
889 # Decoding with unicode_internal used to not correctly handle "code
890 # points" above 0x10ffff on UCS-4 builds.
891 if sys.maxunicode > 0xffff:
892 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000893 (b"\x00\x10\xff\xff", "\U0010ffff"),
894 (b"\x00\x00\x01\x01", "\U00000101"),
895 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000896 ]
897 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000898 b"\x7f\xff\xff\xff",
899 b"\x80\x00\x00\x00",
900 b"\x81\x00\x00\x00",
901 b"\x00",
902 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000903 ]
904 for internal, uni in ok:
905 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000906 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000907 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000908 for internal in not_ok:
909 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000910 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000911 self.assertRaises(UnicodeDecodeError, internal.decode,
912 "unicode_internal")
913
914 def test_decode_error_attributes(self):
915 if sys.maxunicode > 0xffff:
916 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000917 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000918 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000919 self.assertEqual("unicode_internal", ex.encoding)
920 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
921 self.assertEqual(4, ex.start)
922 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000923 else:
924 self.fail()
925
926 def test_decode_callback(self):
927 if sys.maxunicode > 0xffff:
928 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
929 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000930 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000931 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
932 "ascii"),
933 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000934 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000935
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000936 def test_encode_length(self):
937 # Issue 3739
938 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000939 self.assertEqual(encoder("a")[1], 1)
940 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000941
Ezio Melottib3aedd42010-11-20 19:04:17 +0000942 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000943
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both given as UTF-8 encoded bytes.
# An expected value of None marks input that nameprep must reject, and a
# (None, None) entry is a placeholder for a skipped vector: NameprepTest
# reports failures by list position ("Test 3.<n>"), so the placeholders
# keep the positions in step with the section numbers in the comments.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1096
1097
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Each vector either maps to its expected text or is rejected."""
        from encodings.idna import nameprep

        for index, (raw_input, raw_expected) in enumerate(nameprep_tests):
            if raw_input is None:
                # Placeholder for a skipped vector.
                continue

            # The Unicode strings are given in UTF-8.
            source = str(raw_input, "utf-8", "surrogatepass")

            if raw_expected is None:
                # Input contains prohibited characters.
                with self.assertRaises(UnicodeError):
                    nameprep(source)
                continue

            expected = str(raw_expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(source), expected)
            except Exception as exc:
                # Re-raise with the vector number so the failure is traceable.
                raise support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001116
class IDNACodecTest(unittest.TestCase):
    """Exercise the idna codec one-shot, via a stream and incrementally."""

    def test_builtin_decode(self):
        """str(data, "idna") decodes plain and xn-- prefixed names."""
        pairs = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, text in pairs:
            self.assertEqual(str(encoded, "idna"), text)

    def test_builtin_encode(self):
        """str.encode("idna") encodes ASCII and non-ASCII names."""
        pairs = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, encoded in pairs:
            self.assertEqual(text.encode("idna"), encoded)

    def test_stream(self):
        """Reading again after the data has been consumed gives ""."""
        make_reader = codecs.getreader("idna")
        reader = make_reader(io.BytesIO(b"abc"))
        reader.read(3)
        leftover = reader.read()
        self.assertEqual(leftover, "")

    def test_incremental_decode(self):
        """Decode byte by byte with iterdecode, then in uneven chunks."""
        def decode_bytewise(data):
            single_bytes = (bytes([value]) for value in data)
            return "".join(codecs.iterdecode(single_bytes, "idna"))

        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")
        # Same input checked a second time, as in the original test.
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")

        decoder = codecs.getincrementaldecoder("idna")()

        # Without a trailing dot the last label only appears on the final
        # call.
        steps = [
            ((b"xn--xam",), ""),
            ((b"ple-9ta.o",), "\xe4xample."),
            ((b"rg",), ""),
            ((b"", True), "org"),
        ]
        for args, expected in steps:
            self.assertEqual(decoder.decode(*args), expected)

        decoder.reset()

        # With a trailing dot nothing is left over for the final call.
        steps = [
            ((b"xn--xam",), ""),
            ((b"ple-9ta.o",), "\xe4xample."),
            ((b"rg.",), "org."),
            ((b"", True), ""),
        ]
        for args, expected in steps:
            self.assertEqual(decoder.decode(*args), expected)

    def test_incremental_encode(self):
        """Encode with iterencode, then in uneven chunks."""
        def encode_piecewise(text):
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(encode_piecewise("python.org"), b"python.org")
        self.assertEqual(encode_piecewise("python.org."), b"python.org.")
        self.assertEqual(encode_piecewise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")
        # Same input checked a second time, as in the original test.
        self.assertEqual(encode_piecewise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")

        encoder = codecs.getincrementalencoder("idna")()

        # Without a trailing dot the last label only appears on the final
        # call.
        steps = [
            (("\xe4x",), b""),
            (("ample.org",), b"xn--xample-9ta."),
            (("", True), b"org"),
        ]
        for args, expected in steps:
            self.assertEqual(encoder.encode(*args), expected)

        encoder.reset()

        # With a trailing dot nothing is left over for the final call.
        steps = [
            (("\xe4x",), b""),
            (("ample.org.",), b"xn--xample-9ta.org."),
            (("", True), b""),
        ]
        for args, expected in steps:
            self.assertEqual(encoder.encode(*args), expected)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001192
class CodecsModuleTest(unittest.TestCase):
    """Argument handling of the module-level functions in codecs."""

    def test_decode(self):
        """codecs.decode(): explicit codec, default codec and errors."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit codec, default codec and errors."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() needs exactly one argument and rejects 42."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects missing, unknown and blank names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bad_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bad_name)

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001234
class StreamReaderTest(unittest.TestCase):
    """Tests for the utf-8 StreamReader."""

    def setUp(self):
        """Prepare a two-line UTF-8 byte stream and the reader factory."""
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text at the newline."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001244
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile recoding on read and on write."""

    def test_basic(self):
        """Recode UTF-8 to UTF-16-LE on read, and UTF-8 to Latin-1 on write."""
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file stores Latin-1.
        sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001256
# Names of the text codecs that the generic tests in this file iterate over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1361
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Everything listed above plus "idna"; the tests consult this list before
# exercising the incremental encoder/decoder of a codec.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001374
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks that are run for every codec in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by codecs.lookup() must match the list entry
            # once "-" and "_" are treated as the same character.  Entries
            # ending in "_codec" get the suffix re-appended and "latin_1"
            # is special-cased before the comparison.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encode followed by decode must round-trip
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, feeding one character (and
                # afterwards one byte) at a time through a Queue
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Rewind and re-read several times: every pass must return the
            # complete text again.
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # calling without any argument must be rejected
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the int-argument check;
            # the reason is not visible here -- TODO confirm
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            # calling without any argument must be rejected
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (The comparison below is deliberately trivial.)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1507
class CharmapTest(unittest.TestCase):
    """Checks for codecs.charmap_decode() driven by a string decoding map."""

    def test_decode_with_string_map(self):
        # every input byte has an entry in the map
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": a byte past the end of the map yields U+FFFD ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # ... and so does a byte whose map entry is U+FFFE
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": the same two situations simply produce no output, while
        # the reported length still covers all input bytes
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # an empty map together with "ignore" drops every possible byte
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1540
class WithStmtTest(unittest.TestCase):
    """Checks that codecs file wrappers can be used in a with statement."""

    def test_encodedfile(self):
        # The wrapped file holds UTF-8; the wrapper returns Latin-1 bytes.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Build the reader/writer pair from the UTF-8 codec's stream classes
        # and read the decoded text inside the with block.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001553
class TypesTest(unittest.TestCase):
    """Checks which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001583
class SurrogateEscapeTest(unittest.TestCase):
    """Checks for the "surrogateescape" error handler.

    Each test decodes bytes that the codec cannot map and expects one lone
    surrogate U+DCxx per offending byte 0xxx, and/or encodes such
    surrogates and expects the original bytes back.
    """

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1616
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001617
class BomTest(unittest.TestCase):
    """Checks when files opened with codecs.open() write a byte order mark."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every scenario below re-creates support.TESTFN; remove it when the
        # test finishes (even on failure) so no stray file is left behind.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001672
Victor Stinner3fed0872010-05-22 02:16:27 +00001673
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The first four are
# always listed; the compression codecs are added only when the module they
# rely on can be imported.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
1692
class TransformCodecTest(unittest.TestCase):
    """Checks for the bytes-to-bytes codecs in bytes_transform_encodings."""

    def test_basics(self):
        # The same 256 byte values, once as bytes and once as bytearray
        binput = bytes(range(256))
        ainput = bytearray(binput)
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o))
            self.assertEqual(i, binput)

            # transform interface: the result type must follow the type of
            # the object the method was called on
            boutput = binput.transform(encoding)
            aoutput = ainput.transform(encoding)
            self.assertEqual(boutput, aoutput)
            self.assertIsInstance(boutput, bytes)
            self.assertIsInstance(aoutput, bytearray)
            bback = boutput.untransform(encoding)
            aback = aoutput.untransform(encoding)
            self.assertEqual(bback, aback)
            self.assertEqual(bback, binput)
            self.assertIsInstance(bback, bytes)
            self.assertIsInstance(aback, bytearray)

    def test_read(self):
        # A stream reader over the encoded form must give back the input
        for encoding in bytes_transform_encodings:
            sin = b"\x80".transform(encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # These two codecs are left out of the readline() check; the
            # reason is not stated here -- TODO confirm
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            sin = b"\x80".transform(encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
1733
1734
def test_main():
    """Run every test case class of this module through support.run_unittest()."""
    support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
Fred Drake2e2be372001-09-20 21:33:42 +00001764
1765
# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()