blob: 5ee2d9ff18009858871b23911c47a553902d5887 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO: data written at one end is read back from the other.

    The buffer keeps the sequence type it was constructed with (the tests
    pass ``b""``), so reads return slices of that same type.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """
        Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains everything that is buffered.
        """
        pending = self._buffer
        if size >= 0:
            head, tail = pending[:size], pending[size:]
        else:
            # an empty slice empties the buffer while keeping its type
            head, tail = pending, pending[:0]
        self._buffer = tail
        return head
25
class MixInCheckStateHandling:
    """
    Mixin for TestCase classes that checks the getstate()/setstate()
    protocol of incremental codecs.

    The host class must provide the unittest assertion methods.
    """
    def check_state_handling_decode(self, encoding, u, s):
        """
        Split the encoded input *s* at every possible position, move the
        decoder state into a fresh decoder at the split point and check
        that both parts together still decode to *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """
        Split the text *u* at every possible position, move the encoder
        state into a fresh encoder at the split point and check that both
        parts together still encode to *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # Continue with a brand-new encoder primed with the saved state
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Shared tests for stream readers and incremental decoders.  The class
    # itself names no codec: every method reads self.encoding, which the
    # concrete subclasses set as a class attribute.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line until readline() returns an empty
            # string and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be spelled out as a tuple: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() yields an empty list
        # and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line gets at least 200 characters: an empty first line
            # would make readalllines(keepends=False) stop immediately.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were joined and encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so readline() only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n" terminated lines must come back unchanged, followed
        # by an empty string once the stream is exhausted.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    # Tests for the "utf-32" codec, which reads and writes a BOM.
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with 0xff bytes only must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated final chunk is replaced or dropped depending on the
        # error handler; one input byte is reported as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    # Tests for the "utf-32-le" codec.
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        unit = 4
        # Nothing is expected for the first three bytes; after that each
        # prefix of the text is expected for four consecutive bytes, and
        # the complete text once the last byte has arrived.
        expected = [""] * (unit - 1)
        for done in range(1, len(text)):
            expected.extend([text[:done]] * unit)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # a single byte passed with final=True must be rejected
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        expected = '\U00010000' * repeat
        encoded = b'\x00\x00\x01\x00' * repeat
        self.assertEqual(expected, codecs.utf_32_le_decode(encoded)[0])
406
class UTF32BETest(ReadTest):
    # Tests for the "utf-32-be" codec.
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        unit = 4
        # Nothing is expected for the first three bytes; after that each
        # prefix of the text is expected for four consecutive bytes, and
        # the complete text once the last byte has arrived.
        expected = [""] * (unit - 1)
        for done in range(1, len(text)):
            expected.extend([text[:done]] * unit)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # a single byte passed with final=True must be rejected
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        expected = '\U00010000' * repeat
        encoded = b'\x00\x01\x00\x00' * repeat
        self.assertEqual(expected, codecs.utf_32_be_decode(encoded)[0])
446
447
class UTF16Test(ReadTest):
    # Tests for the "utf-16" codec, which reads and writes a BOM.
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with 0xff bytes only must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated final chunk is replaced or dropped depending on the
        # error handler; one input byte is reported as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            # remove the scratch file whether or not the check passed
            support.unlink(support.TESTFN)
525
class UTF16LETest(ReadTest):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        unit = 2
        # Nothing is expected for the first byte; after that each prefix
        # of the text is expected for two consecutive bytes, and the
        # complete text once the last byte has arrived.
        expected = [""] * (unit - 1)
        for done in range(1, len(text)):
            expected.extend([text[:done]] * unit)
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        # a single byte passed with final=True must be rejected
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000547
class UTF16BETest(ReadTest):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        unit = 2
        # Nothing is expected for the first byte; after that each prefix
        # of the text is expected for two consecutive bytes, and the
        # complete text once the last byte has arrived.
        expected = [""] * (unit - 1)
        for done in range(1, len(text)):
            expected.extend([text[:done]] * unit)
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        # a single byte passed with final=True must be rejected
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000569
class UTF8Test(ReadTest):
    # Tests for the "utf-8" codec.
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte of the input text.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # ... and passed on to the requested error handler otherwise.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets a lone surrogate through in both directions
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000616
class UTF7Test(ReadTest):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        # decoded text expected after each successive input byte
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000631
class UTF16ExTest(unittest.TestCase):
    # Error-path checks for codecs.utf_16_ex_decode().

    def test_errors(self):
        # a single byte passed with final=True must be rejected in
        # strict mode
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # calling without any argument is a TypeError
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
639
class ReadBufferTest(unittest.TestCase):
    # Checks for codecs.readbuffer_encode().

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        # the array's content comes back as bytes together with its length
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # missing argument
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        # argument that is an int
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
655
656class CharBufferTest(unittest.TestCase):
657
658 def test_string(self):
Guido van Rossum09549f42007-08-27 20:40:10 +0000659 self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))
Walter Dörwalde22d3392005-11-17 08:52:34 +0000660
661 def test_empty(self):
Guido van Rossum09549f42007-08-27 20:40:10 +0000662 self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))
Walter Dörwalde22d3392005-11-17 08:52:34 +0000663
664 def test_bad_args(self):
665 self.assertRaises(TypeError, codecs.charbuffer_encode)
666 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
667
class UTF8SigTest(ReadTest):
    # Tests for the "utf-8-sig" codec, which skips one leading BOM.
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # the incremental decoder must not emit the BOM written by encode()
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a "utf-8-sig" StreamReader once without a
        # size argument and once per size hint, and check that the
        # concatenated chunks always equal unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input that starts with a BOM
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # the same text without a BOM
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
747
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() yields a (bytes, consumed-length) pair, so the
        # expected value has to be a bytes object; a str never compares
        # equal to bytes on Python 3.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

Marc-André Lemburg29273c82003-02-04 19:35:03 +0000752class RecodingTest(unittest.TestCase):
753 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000754 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000755 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000756 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000757 f2.close()
758 # Python used to crash on this at exit because of a refcount
759 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000760
# From RFC 3492
# Each entry is a (text, punycode-encoded bytes) pair.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a 2-tuple
# (e.g. because of a missing comma between adjacent literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))

869class PunycodeTest(unittest.TestCase):
870 def test_encode(self):
871 for uni, puny in punycode_testcases:
872 # Need to convert both strings to lower case, since
873 # some of the extended encodings use upper case, but our
874 # code produces only lower case. Converting just puny to
875 # lower is also insufficient, since some of the input characters
876 # are upper case.
Walter Dörwalda4c61282007-05-10 12:36:25 +0000877 self.assertEquals(
878 str(uni.encode("punycode"), "ascii").lower(),
879 str(puny, "ascii").lower()
880 )
Martin v. Löwis2548c732003-04-18 10:39:54 +0000881
882 def test_decode(self):
883 for uni, puny in punycode_testcases:
884 self.assertEquals(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +0000885 puny = puny.decode("ascii").encode("ascii")
886 self.assertEquals(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000887
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000888class UnicodeInternalTest(unittest.TestCase):
889 def test_bug1251300(self):
890 # Decoding with unicode_internal used to not correctly handle "code
891 # points" above 0x10ffff on UCS-4 builds.
892 if sys.maxunicode > 0xffff:
893 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000894 (b"\x00\x10\xff\xff", "\U0010ffff"),
895 (b"\x00\x00\x01\x01", "\U00000101"),
896 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000897 ]
898 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000899 b"\x7f\xff\xff\xff",
900 b"\x80\x00\x00\x00",
901 b"\x81\x00\x00\x00",
902 b"\x00",
903 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000904 ]
905 for internal, uni in ok:
906 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000907 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000908 self.assertEquals(uni, internal.decode("unicode_internal"))
909 for internal in not_ok:
910 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000911 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000912 self.assertRaises(UnicodeDecodeError, internal.decode,
913 "unicode_internal")
914
915 def test_decode_error_attributes(self):
916 if sys.maxunicode > 0xffff:
917 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000918 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000919 except UnicodeDecodeError as ex:
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000920 self.assertEquals("unicode_internal", ex.encoding)
Walter Dörwald092a2252007-06-07 11:26:16 +0000921 self.assertEquals(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000922 self.assertEquals(4, ex.start)
923 self.assertEquals(8, ex.end)
924 else:
925 self.fail()
926
927 def test_decode_callback(self):
928 if sys.maxunicode > 0xffff:
929 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
930 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000931 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000932 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
933 "ascii"),
934 "UnicodeInternalTest")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000935 self.assertEquals(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000936
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000937 def test_encode_length(self):
938 # Issue 3739
939 encoder = codecs.getencoder("unicode_internal")
940 self.assertEquals(encoder("a")[1], 1)
941 self.assertEquals(encoder("\xe9\u0142")[1], 2)
942
Philip Jenveyddf0d032010-06-09 17:56:11 +0000943 self.assertEquals(codecs.escape_encode(br'\x00')[1], 4)
944
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 input, UTF-8 expected output).  An expected output
# of None means nameprep must reject the input; (None, None) marks a
# vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


1099class NameprepTest(unittest.TestCase):
1100 def test_nameprep(self):
1101 from encodings.idna import nameprep
1102 for pos, (orig, prepped) in enumerate(nameprep_tests):
1103 if orig is None:
1104 # Skipped
1105 continue
1106 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001107 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001108 if prepped is None:
1109 # Input contains prohibited characters
1110 self.assertRaises(UnicodeError, nameprep, orig)
1111 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001112 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001113 try:
1114 self.assertEquals(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001115 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001116 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001117
class IDNACodecTest(unittest.TestCase):
    """Tests for the idna codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, a further read() must
        # return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # The expected values below show the decoder emitting output only
        # for labels terminated by a dot; the trailing label comes out on
        # the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # Same pattern as decoding: the unterminated trailing label is
        # only emitted on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of codecs.

    Covers decode(), encode(), register(), lookup() and the get*() helpers,
    mostly through their argument-error paths.
    """

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):
    """Tests for the utf-8 StreamReader returned by codecs.getreader()."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])

class EncodedFileTest(unittest.TestCase):
    """Tests for the recoding wrapper returned by codecs.EncodedFile()."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the wrapper hands back
        # the same text as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written to the wrapper land in the
        # underlying file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')

# Names of the text encodings exercised by the generic codec tests.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build's codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

Walter Dörwaldee1d2472004-12-29 16:04:38 +00001361# The following encoding is not tested, because it's not supposed
1362# to work:
1363# "undefined"
1364
# Encodings that don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Same as above, plus the encodings excluded from the incremental coder checks
broken_incremental_coders = broken_unicode_with_streams + ["idna"]

# Encodings that only support the "strict" error mode
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001378
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        # Round-trip a short string through each codec four ways: the
        # stateless encoder/decoder functions, the stream writer/reader,
        # the incremental coders (Python and _testcapi flavours) and
        # iterencode()/iterdecode().
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry once
            # "_" and "-" are treated as the same character.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip; the encoder must report having consumed
            # the whole input.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and drain the queue after
                # each write; every drained chunk must be a bytes object.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still holds
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Note: unlike the checks above, no final=True call
                        # is made on the coders here.
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Read the whole stream five times over, rewinding before each
            # read.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # "idna" and "punycode" are also checked to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # Both helpers are inherited from MixInCheckStateHandling.
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1511
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() when the mapping is a string."""

    def test_decode_with_string_map(self):
        # Every input byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # "replace": byte 2 lies past the end of the map ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # ... or maps to U+FFFE; both decode to U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore": the same two cases drop the byte but still count it
        # as consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # With an empty map every byte is dropped.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1544
class WithStmtTest(unittest.TestCase):
    """Check that the codecs file wrappers work in a ``with`` statement."""

    def test_encodedfile(self):
        # UTF-8 bytes in the underlying stream are read back as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through the UTF-8 stream reader yields decoded text.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001557
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
1587
class SurrogateEscapeTest(unittest.TestCase):
    """Exercise the "surrogateescape" error handler with several codecs."""

    def test_utf8(self):
        # A byte that is not valid UTF-8
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
        self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)
        # A surrogate that was (incorrectly) encoded as UTF-8
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
        self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # A byte outside the ASCII range
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # \xa5 has no mapping in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1620
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001621
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every scenario below reuses the same scratch file; make sure it
        # does not outlive the test.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1676
Victor Stinner37b82002010-05-22 02:17:42 +00001677
def test_main():
    """Run all test case classes of this module via support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()