blob: 911d58f8f674d8763e719eb090044e0917fbba99 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    Minimal FIFO buffer: data is appended at one end with write() and
    consumed from the other end with read().
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the element type (e.g. bytes or str).
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to size items from the front of the queue.

        A negative size drains the whole queue.
        """
        pending = self._buffer
        if size >= 0:
            self._buffer = pending[size:]
            return pending[:size]
        # Slicing with [:0] yields an empty buffer of the same type.
        self._buffer = pending[:0]
        return pending
25
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() round trips of incremental
    codecs.

    The methods use assert* methods of unittest.TestCase, so this class has
    to be mixed into a TestCase subclass.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be transplanted at every offset.

        For each split point of the encoded input s, the first part is
        decoded, the decoder state is moved into a fresh decoder, and the
        remainder is decoded there. Both parts together must equal u.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be transplanted at every offset.

        For each split point of the text u, the first part is encoded, the
        encoder state is moved into a fresh encoder, and the remainder is
        encoded there. Both parts together must equal s.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader and decoder tests shared by the per-encoding test classes.

    Subclasses provide the codec under test via the ``encoding`` class
    attribute.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded input one byte at a time and check the output.

        After each byte, the text decoded so far must equal the matching
        entry of partialresults. The check is done with a StreamReader, with
        an incremental decoder (before and after reset()) and finally with
        codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """Check StreamReader.readline() for all supported line endings."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until EOF and return the lines joined with "|" so that
            # the line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: every one of them is
        # whitespace, so building the list with str.split() would yield an
        # empty list and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would look like EOF to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating over a StreamReader must return every line unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        count = 0
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])
            count += 1
        # A reader that stops early must not go unnoticed.
        self.assertEqual(count, len(s))

    def test_readlinequeue(self):
        """Check readline() on a stream that receives its data piecemeal."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return each of three consecutive lines intact."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """readline() must return each of five consecutive lines intact."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000278
class UTF32Test(ReadTest):
    """Tests for the BOM-aware "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" encoded with a little-endian / big-endian BOM in front
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Several writes to one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that starts with an invalid BOM must be rejected."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Check byte-by-byte decoding, including the leading BOM."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """Check the 'replace' and 'ignore' handlers on truncated input."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state must survive a getstate()/setstate() round trip."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
365
class UTF32LETest(ReadTest):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        """Check byte-by-byte decoding of four code units."""
        # One expected prefix per input byte: a character only shows up
        # once the last of its four bytes has been fed.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_simple(self):
        """A non-BMP character is encoded least significant byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
405
class UTF32BETest(ReadTest):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        """Check byte-by-byte decoding of four code units."""
        # One expected prefix per input byte: a character only shows up
        # once the last of its four bytes has been fed.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_simple(self):
        """A non-BMP character is encoded most significant byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
445
446
class UTF16Test(ReadTest):
    """Tests for the BOM-aware "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" encoded with a little-endian / big-endian BOM in front
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Several writes to one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that starts with an invalid BOM must be rejected."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Check byte-by-byte decoding, including the leading BOM."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """Check the 'replace' and 'ignore' handlers on truncated input."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state must survive a getstate()/setstate() round trip."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            # NOTE(review): the 'U' mode flag is deprecated in newer Python
            # releases; revisit when the supported version range changes.
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            support.unlink(support.TESTFN)
524
class UTF16LETest(ReadTest):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Check byte-by-byte decoding of four code units."""
        # One expected prefix per input byte: a character only shows up
        # once the second of its two bytes has been fed.
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000546
class UTF16BETest(ReadTest):
    """Tests for the big-endian "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Check byte-by-byte decoding of four code units."""
        # One expected prefix per input byte: a character only shows up
        # once the second of its two bytes has been fed.
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"]
        )
        self.check_partial("\x00\xff\u0100\uffff", expected)

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000568
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        """Check byte-by-byte decoding of 1-, 2- and 3-byte sequences."""
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        """Decoder state must survive a getstate()/setstate() round trip."""
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler deals
        with them."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        """The 'surrogatepass' handler lets lone surrogates through in
        both directions."""
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000615
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Check byte-by-byte decoding of text that contains a '+'."""
        # One expected prefix per byte of the encoded input.
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial("a+-b", expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000630
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder without arguments is a TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
638
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of bytes is returned as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        outcome = codecs.readbuffer_encode(source)
        self.assertEqual(outcome, (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument is a TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
654
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Check byte-by-byte decoding of text that itself starts with
        U+FEFF."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder must strip the BOM written by the
        encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_reads(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # size hints (None meaning "no hint") and check that the chunks
        # always add up to unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        """A stream that starts with a BOM decodes to the text without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_reads(bytestring, unistring)

    def test_stream_bare(self):
        """A stream without a BOM decodes like plain UTF-8."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_reads(bytestring, unistring)
734
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty bytes with zero input consumed."""
        # escape_decode() returns bytes, so the expected value has to be a
        # bytes object; a str never compares equal to it.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000738
class RecodingTest(unittest.TestCase):
    """Regression test: writing through an EncodedFile and closing it."""

    def test_recoding(self):
        f = io.BytesIO()
        # Use the recoder as a context manager so that it is closed even
        # if write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000747
# From RFC 3492
# Each entry is a (unicode text, punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: report any entry that is not a 2-tuple
# (typically caused by a missing comma in the table above).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000855
class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases table through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt through an ASCII
            # decode/encode round trip.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000874
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the "unicode_internal" codec."""

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            # Only meaningful on builds with 4-byte code units.
            return
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are written big-endian; reverse them on
        # little-endian machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode <= 0xffff:
            return
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail("UnicodeDecodeError not raised")

    def test_decode_callback(self):
        if sys.maxunicode <= 0xffff:
            return
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Put four extra bytes between the encodings of "a" and "b"; with
        # the "ignore" handler registered above they must be dropped.
        ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                "ascii"),
                          "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
931
Martin v. Löwis2548c732003-04-18 10:39:54 +0000932# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
933nameprep_tests = [
934 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000935 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
936 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
937 b'\xb8\x8f\xef\xbb\xbf',
938 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000939 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000940 (b'CAFE',
941 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000942 # 3.3 Case folding 8bit U+00DF (german sharp s).
943 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000944 (b'\xc3\x9f',
945 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000946 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000947 (b'\xc4\xb0',
948 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000949 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000950 (b'\xc5\x83\xcd\xba',
951 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000952 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
953 # XXX: skip this as it fails in UCS-2 mode
954 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
955 # 'telc\xe2\x88\x95kg\xcf\x83'),
956 (None, None),
957 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000958 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
959 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000960 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000961 (b'\xe1\xbe\xb7',
962 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000963 # 3.9 Self-reverting case folding U+01F0 and normalization.
964 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000965 (b'\xc7\xb0',
966 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000967 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000968 (b'\xce\x90',
969 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000970 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000971 (b'\xce\xb0',
972 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000973 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000974 (b'\xe1\xba\x96',
975 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000976 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000977 (b'\xe1\xbd\x96',
978 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000979 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000980 (b' ',
981 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000982 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000983 (b'\xc2\xa0',
984 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000985 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000986 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +0000987 None),
988 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000989 (b'\xe2\x80\x80',
990 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000991 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000992 (b'\xe2\x80\x8b',
993 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000994 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000995 (b'\xe3\x80\x80',
996 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000997 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000998 (b'\x10\x7f',
999 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001000 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001001 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001002 None),
1003 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001004 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001005 None),
1006 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001007 (b'\xef\xbb\xbf',
1008 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001009 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001010 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001011 None),
1012 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001013 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001014 None),
1015 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001016 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001017 None),
1018 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001019 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001020 None),
1021 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001022 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001023 None),
1024 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001025 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001026 None),
1027 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001028 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001029 None),
1030 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001031 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001032 None),
1033 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001034 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001035 None),
1036 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001037 (b'\xcd\x81',
1038 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001039 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001040 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001041 None),
1042 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001043 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001044 None),
1045 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001046 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001047 None),
1048 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001049 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001050 None),
1051 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001052 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001053 None),
1054 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001055 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001056 None),
1057 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001058 (b'foo\xef\xb9\xb6bar',
1059 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001060 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001061 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001062 None),
1063 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001064 (b'\xd8\xa71\xd8\xa8',
1065 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001066 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001067 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001068 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001069 # None),
1070 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001071 # 3.44 Larger test (shrinking).
1072 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001073 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1074 b'\xaa\xce\xb0\xe2\x80\x80',
1075 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001076 # 3.45 Larger test (expanding).
1077 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001078 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1079 b'\x80',
1080 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1081 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1082 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001083 ]
1084
1085
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (3.<pos+1>) so the
                    # failing table entry can be located; chain the
                    # original exception for its traceback.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos+1, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001104
class IDNACodecTest(unittest.TestCase):
    """Checks for the "idna" codec via str(), encode(), streams and
    the incremental coders."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the three available characters have been read, a further
        # read() must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input to iterdecode() one byte at a time.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been seen,
        # or when final=True flushes the decoder.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once the dot that ends it has been seen,
        # or when final=True flushes the encoder.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001180
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level functions of codecs: results for valid
    calls and the exception types for missing or unknown arguments."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001222
class StreamReaderTest(unittest.TestCase):
    """Checks for StreamReader.readlines() on a UTF-8 stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two 3-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001232
class EncodedFileTest(unittest.TestCase):
    """Checks for codecs.EncodedFile() recoding on read and on write."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE bytes.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed to write() land in the file as latin1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001244
# Names of the encodings exercised by the generic tests below.  The order
# is kept as-is because the list is iterated in order.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    # code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    # EUC / GB
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    # ISO 2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO 8859
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    # Mac
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    # UTF
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1347
# "undefined" is deliberately not tested, because it's not supposed
# to work.

# Encodings that don't work in stateful mode.
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings excluded from the incremental encoder/decoder checks:
# everything that is broken for streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]

# Encodings that only support "strict" mode.
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001365
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every name in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII string through each codec API."""

        def mismatch(actual, expected, encoding):
            # Failure message shared by the round-trip assertions.
            return "%r != %r (encoding=%r)" % (actual, expected, encoding)

        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The looked-up codec name must match the requested name,
            # ignoring the difference between "_" and "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encoder/decoder
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             mismatch(consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, mismatch(decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: write one character at a
                # time and collect whatever bytes reach the queue
                buf = Queue(b"")
                writer = codecs.getwriter(encoding)(buf)
                chunks = []
                for char in text:
                    writer.write(char)
                    chunk = buf.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    chunks.append(chunk)
                encoded = b"".join(chunks)

                # feed the bytes back one at a time through a reader
                buf = Queue(b"")
                reader = codecs.getreader(encoding)(buf)
                pieces = []
                for byte in encoded:
                    buf.write(bytes([byte]))
                    pieces.append(reader.read())
                decoded = "".join(pieces)
                self.assertEqual(decoded, text,
                                 mismatch(decoded, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                pieces = [encoder.encode(char) for char in text]
                pieces.append(encoder.encode("", True))
                encoded = b"".join(pieces)
                decoder = codecs.getincrementaldecoder(encoding)()
                pieces = [decoder.decode(bytes([byte])) for byte in encoded]
                pieces.append(decoder.decode(b"", True))
                decoded = "".join(pieces)
                self.assertEqual(decoded, text,
                                 mismatch(decoded, text, encoding))

                # check C API
                pieces = [cencoder.encode(char) for char in text]
                pieces.append(cencoder.encode("", True))
                encoded = b"".join(pieces)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                pieces = [cdecoder.decode(bytes([byte])) for byte in encoded]
                pieces.append(cdecoder.decode(b"", True))
                decoded = "".join(pieces)
                self.assertEqual(decoded, text,
                                 mismatch(decoded, text, encoding))

                # check iterencode()/iterdecode()
                encoded_iter = codecs.iterencode(text, encoding)
                result = "".join(codecs.iterdecode(encoded_iter, encoding))
                self.assertEqual(result, text, mismatch(result, text, encoding))

                # check iterencode()/iterdecode() with empty string
                encoded_iter = codecs.iterencode("", encoding)
                result = "".join(codecs.iterdecode(encoded_iter, encoding))
                self.assertEqual(result, "")

            if encoding in only_strict_mode:
                continue

            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encoded = b"".join([encoder.encode(char) for char in text])
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decoded = "".join(
                    [decoder.decode(bytes([byte])) for byte in encoded])
                self.assertEqual(decoded, text,
                                 mismatch(decoded, text, encoding))

                encoded = b"".join([cencoder.encode(char) for char in text])
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decoded = "".join(
                    [cdecoder.decode(bytes([byte])) for byte in encoded])
                self.assertEqual(decoded, text,
                                 mismatch(decoded, text, encoding))

    def test_seek(self):
        """Re-read the same stream several times after seeking to the start."""
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _attempt in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        """Decoders must reject a missing argument and (mostly) an int."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must reject being called without an argument."""
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        # This used to crash, we are only verifying there's no crash.
        from encodings import cp1140
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text, text.encode(encoding))
            self.check_state_handling_encode(encoding, text, text.encode(encoding))
1498
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with a string as decoding map."""

    def test_decode_with_string_map(self):
        """Decode three bytes against maps of varying completeness.

        Each check compares the (decoded text, bytes consumed) pair returned
        by charmap_decode() for a given error handler and decoding map.
        """
        # Every byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # The map is too short for byte 0x02: "replace" yields U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A U+FFFE entry in the map gives the same result as a missing one.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # With "ignore" the byte without a usable mapping is dropped.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map with "ignore" consumes all input and decodes nothing.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1531
class WithStmtTest(unittest.TestCase):
    """Check that codecs file wrappers can be used in a with statement."""

    def test_encodedfile(self):
        """EncodedFile works as a context manager and recodes on read."""
        f = io.BytesIO(b"\xc3\xbc")
        # The underlying file holds UTF-8; reads come back as Latin-1 bytes.
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter works as a context manager and decodes on read."""
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001544
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        """The listed decoders must raise TypeError when given a str."""
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        """The escape decoders give the same result for str and bytes input."""
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1574
class SurrogateEscapeTest(unittest.TestCase):
    """Checks of the "surrogateescape" error handler with several codecs."""

    def test_utf8(self):
        handler = "surrogateescape"
        # Bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)
        # bad-utf-8 encoded surrogate
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1607
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001608
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open() around seek()."""

    def test_seek0(self):
        """Write, seek and re-read a file for each UTF-16/UTF-32 variant.

        Every scenario writes through codecs.open(support.TESTFN, 'w+') and
        then reads the file back, comparing the text with what was written.
        """
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Remove the scratch file once the test finishes, pass or fail, so
        # it does not leak into other tests.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1663
Victor Stinner3fed0872010-05-22 02:16:27 +00001664
def test_main():
    """Run all test case classes of this module via support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001693
1694
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()