blob: 521cbce35b480fcd8f686d278bc03c05bc4e05a3 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial value also fixes the payload type: slicing it with
        # [:0] later yields an empty value of that same type.
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative size (the default) drains everything that is queued;
        otherwise at most size items are returned.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
25
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    The methods call assertIsInstance/assertFalse/assertEqual on self, so
    the class is meant to be mixed into a unittest.TestCase.
    """
    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be moved to a fresh decoder.

        For every split position i, s[:i] is decoded with one decoder, its
        state is installed into a new decoder which decodes s[i:], and the
        concatenated output must equal u.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be moved to a fresh encoder.

        For every split position i, u[:i] is encoded with one encoder, its
        state is installed into a new encoder which encodes u[i:], and the
        concatenated output must equal s.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Shared reader tests; every method reads the codec name from
    self.encoding, which this class itself does not define."""

    def check_partial(self, input, partialresults):
        """Feed the encoded input one byte at a time and compare the text
        decoded so far with the matching entry of partialresults."""
        # The encoded form is needed by every pass below; compute it once.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # Exercises StreamReader.readline() with every supported line end,
        # with and without keepends, and with a size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: every one of them is
        # whitespace, so building the list with str.split() would give an
        # empty list and the loops below would never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expected values must too.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows in every repetition
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were joined and encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # A writer and a reader share one Queue, so data becomes available
        # to readline() piece by piece.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n"-terminated lines must come back unchanged, followed
        # by an empty string at end of input.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000278
class UTF32Test(ReadTest):
    """Tests for the BOM-prefixed "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # Truncated input with the 'replace' and 'ignore' error handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
365
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # No BOM here: a character is available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
405
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # No BOM here: a character is available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
445
446
class UTF16Test(ReadTest):
    """Tests for the BOM-prefixed "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # Truncated input with the 'replace' and 'ignore' error handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            # remove the scratch file whether or not the check passed
            support.unlink(support.TESTFN)
524
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM here: a character is available after every second byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000546
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM here: a character is available after every second byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000568
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: the text only grows once the last
        # byte of a multi-byte sequence has arrived.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode; the other error
        # handlers produce the outputs listed below.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets a lone surrogate through in both directions.
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000615
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per byte of the encoded input.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000630
class UTF16ExTest(unittest.TestCase):
    """Tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte with final=True cannot be decoded in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument must raise TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
638
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # An array of bytes is returned as bytes together with its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument and an int argument both raise TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
654
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the signature written by encode().
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        """Read bytestring through a utf-8-sig StreamReader with a range of
        read sizes and check that the collected text equals unistring."""
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a leading BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
734
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected value is bytes too.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000738
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000739class RecodingTest(unittest.TestCase):
740 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000741 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000742 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000743 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000744 f2.close()
745 # Python used to crash on this at exit because of a refcount
746 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000747
# From RFC 3492
# Each entry is a (text, punycode) pair: the Unicode string and the bytes
# PunycodeTest expects it to encode to / decode from.  The letters in the
# comments are the sample labels used for the vectors.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check on the table above: print any entry that is
# not a pair, since the tests unpack every entry into exactly two names.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000855
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt through an ASCII
            # round trip instead of the literal taken from the table.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000874
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000875class UnicodeInternalTest(unittest.TestCase):
876 def test_bug1251300(self):
877 # Decoding with unicode_internal used to not correctly handle "code
878 # points" above 0x10ffff on UCS-4 builds.
879 if sys.maxunicode > 0xffff:
880 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000881 (b"\x00\x10\xff\xff", "\U0010ffff"),
882 (b"\x00\x00\x01\x01", "\U00000101"),
883 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000884 ]
885 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000886 b"\x7f\xff\xff\xff",
887 b"\x80\x00\x00\x00",
888 b"\x81\x00\x00\x00",
889 b"\x00",
890 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000891 ]
892 for internal, uni in ok:
893 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000894 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000895 self.assertEquals(uni, internal.decode("unicode_internal"))
896 for internal in not_ok:
897 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000898 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000899 self.assertRaises(UnicodeDecodeError, internal.decode,
900 "unicode_internal")
901
902 def test_decode_error_attributes(self):
903 if sys.maxunicode > 0xffff:
904 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000905 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000906 except UnicodeDecodeError as ex:
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000907 self.assertEquals("unicode_internal", ex.encoding)
Walter Dörwald092a2252007-06-07 11:26:16 +0000908 self.assertEquals(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000909 self.assertEquals(4, ex.start)
910 self.assertEquals(8, ex.end)
911 else:
912 self.fail()
913
914 def test_decode_callback(self):
915 if sys.maxunicode > 0xffff:
916 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
917 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000918 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000919 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
920 "ascii"),
921 "UnicodeInternalTest")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000922 self.assertEquals(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000923
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000924 def test_encode_length(self):
925 # Issue 3739
926 encoder = codecs.getencoder("unicode_internal")
927 self.assertEquals(encoder("a")[1], 1)
928 self.assertEquals(encoder("\xe9\u0142")[1], 2)
929
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000930 self.assertEquals(codecs.escape_encode(br'\x00')[1], 4)
931
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both given as UTF-8 encoded bytes.
# An expected value of None means nameprep() must reject the input, and a
# (None, None) entry is a placeholder that NameprepTest skips; it keeps
# the list position in step with the 3.N numbering of the vectors.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes \xef\xb9\xb6 below are the UTF-8 form of
    # U+FE76, not U+FB38 as the title says; the title is kept as found.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1084
1085
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the 3.N number of the failing vector.
                    raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001104
class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # A further read on the same stream must return the empty string.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # Output is produced label by label: nothing comes out until a
        # dot arrives, and an unterminated last label only with final=True.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        # Same label-by-label behaviour as the decoder above.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001180
class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001222
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        # The line ending stays attached to the first line.
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001232
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller sees UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001244
# Names of the codecs exercised by the generic tests in this module.
# The order is significant only in that the tests iterate over it as written.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    # cp* code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    # euc_* / gb*
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    # iso2022_*
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # iso8859_*
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    # mac_*
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    # shift_jis*
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    # utf_*
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]
1347
# The "undefined" encoding is deliberately left out of the list above:
# it is not supposed to work, so it is not tested.

# Encodings that don't work in stateful (stream reader/writer) mode.
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001360
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks that are run against every codec listed in
    all_unicode_encodings."""

    # Failure-message template shared by all round-trip assertions.
    _MISMATCH = "%r != %r (encoding=%r)"

    def _check_stream_roundtrip(self, encoding, s):
        """Round-trip *s* through the codec's stream writer and reader.

        The text is written one character at a time; the collected bytes
        are then fed to the reader one byte at a time.
        """
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            # The writer must hand real bytes objects to the stream.
            self.assertIs(type(chunk), bytes)
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s,
                         self._MISMATCH % (decodedresult, s, encoding))

    def _check_incremental_roundtrip(self, encoding, s, encoder,
                                     make_decoder, flush):
        """Encode *s* char by char with *encoder*, decode byte by byte.

        *make_decoder* is called with no arguments, after encoding has
        finished, to create the decoder.  When *flush* is true both coders
        receive a final empty call with final=True.
        """
        encodedresult = b"".join(encoder.encode(c) for c in s)
        if flush:
            encodedresult += encoder.encode("", True)
        decoder = make_decoder()
        decodedresult = "".join(decoder.decode(bytes([c]))
                                for c in encodedresult)
        if flush:
            decodedresult += decoder.decode(b"", True)
        self.assertEqual(decodedresult, s,
                         self._MISMATCH % (decodedresult, s, encoding))

    def _check_incremental_coders(self, encoding, s):
        """Check the incremental coders (fetched via the Python and the C
        API) and iterencode()/iterdecode() for *encoding*."""
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        self._check_incremental_roundtrip(
            encoding, s, encoder,
            lambda: codecs.getincrementaldecoder(encoding)(), True)

        # check C API
        self._check_incremental_roundtrip(
            encoding, s, cencoder,
            lambda: _testcapi.codec_incrementaldecoder(encoding), True)

        # check iterencode()/iterdecode()
        result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding),
                                           encoding))
        self.assertEqual(result, s, self._MISMATCH % (result, s, encoding))

        # check iterencode()/iterdecode() with empty string
        result = "".join(codecs.iterdecode(codecs.iterencode("", encoding),
                                           encoding))
        self.assertEqual(result, "")

    def _check_incremental_coders_with_errors(self, encoding, s):
        """Check the incremental coders when an errors argument ("ignore")
        is passed; no final flush call is made here."""
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        self._check_incremental_roundtrip(
            encoding, s, encoder,
            lambda: codecs.getincrementaldecoder(encoding)("ignore"), False)
        self._check_incremental_roundtrip(
            encoding, s, cencoder,
            lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
            False)

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the name we asked
            # for, modulo "_" vs "-" and the special cases below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            # Stateless encode/decode round trip.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s),
                             self._MISMATCH % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, self._MISMATCH % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_coders(encoding, s)
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    self._check_incremental_coders_with_errors(encoding, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument and, except for the two
        # codecs excluded below, an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Encoders must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1493
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with a str used as decoding map."""

    def test_decode_with_string_map(self):
        # Every byte has an entry in the map: plain table lookup.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Byte 2 lies past the end of the map: "replace" yields U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A U+FFFE entry in the map gives the same result as a missing one.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # With "ignore" the unmappable byte is dropped, but still counted
        # as consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map with "ignore": every byte is consumed, none decoded.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1526
class WithStmtTest(unittest.TestCase):
    """Check that the codecs file wrappers work in a ``with`` statement."""

    def test_encodedfile(self):
        # The file holds UTF-8 bytes; reading through the EncodedFile
        # wrapper returns them recoded as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through a StreamReaderWriter decodes the UTF-8 bytes.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001539
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1569
class SurrogateEscapeTest(unittest.TestCase):
    """Exercise the "surrogateescape" error handler with several codecs."""

    def _check_both_ways(self, encoding, raw, text):
        # Decoding raw must give text, and encoding text must give raw back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        cases = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in cases:
            self._check_both_ways("utf-8", raw, text)

    def test_ascii(self):
        # bad byte
        self._check_both_ways("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1602
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001603
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open() around seeks."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every case below rewrites support.TESTFN; make sure the file is
        # removed once the test is over, whether it passes or fails.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1658
Victor Stinner3fed0872010-05-22 02:16:27 +00001659
def test_main():
    # Collect the test case classes of this module, in the order they are
    # to be run, and hand them all to support.run_unittest() in one call.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001688
1689
# Run the suite through test_main() when this file is executed as a script.
if __name__ == "__main__":
    test_main()