# Source blob: 0f7c23efccb90e111f8de4535bfd4a1d1c2c6aaa (header line left over from a git web blame view)
import codecs
import io
import locale
import sys
import unittest

import _testcapi
from test import support
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer type is whatever is passed to the constructor; it only
    has to support ``+=`` and slicing.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the internal buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the buffer.

        A negative *size* (the default) drains the whole buffer.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
26
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    Meant to be mixed into a class that provides the unittest assert
    methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every position and decode it in two steps.

        The state of the first decoder is transplanted into a fresh
        decoder before the second part is decoded; the concatenated
        output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every position and encode it in two steps.

        The state of the first encoder is transplanted into a fresh
        encoder before the second part is encoded; the concatenated
        output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Reader tests shared by the codec test classes; every method reads
    # ``self.encoding``, which this class itself does not define.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple on purpose: they are all
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; readalllines() stops at
            # the first empty result.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so the expectation does too
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line in s
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were joined and encoded.
        # NOTE(review): runs of spaces inside these literals look collapsed
        # by the extraction this file went through -- confirm against the
        # upstream test data before relying on exact line lengths.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes readable
        # piece by piece as it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    # Tests for the "utf-32" codec.
    encoding = "utf-32"

    # "spamspam" as UTF-32 with a leading BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    # Tests for the "utf-32-le" codec.
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per encoded byte: a character only appears in the
        # output once its fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
406
class UTF32BETest(ReadTest):
    # Tests for the "utf-32-be" codec.
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per encoded byte: a character only appears in the
        # output once its fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
446
447
class UTF16Test(ReadTest):
    # Tests for the "utf-16" codec.
    encoding = "utf-16"

    # "spamspam" as UTF-16 with a leading BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000523
class UTF16LETest(ReadTest):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per encoded byte: a character only appears in the
        # output once its second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, expected text when decoded with 'replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
562
class UTF16BETest(ReadTest):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per encoded byte: a character only appears in the
        # output once its second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, expected text when decoded with 'replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
601
class UTF8Test(ReadTest):
    # Tests for the "utf-8" codec.
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: a character only appears in the
        # output once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions; the
        # other error handlers each produce their own substitute.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # a truncated or interrupted surrogate sequence is still an error
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000652
class UTF7Test(ReadTest):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000667
class UTF16ExTest(unittest.TestCase):
    # Argument and error checks for codecs.utf_16_ex_decode().

    def test_errors(self):
        # a single byte with final=True cannot be decoded in strict mode
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # calling without any argument is a TypeError
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
675
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        # an array.array of bytes is accepted and returned as bytes
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # a missing argument and a non-buffer argument are both TypeErrors
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
691
class UTF8SigTest(ReadTest):
    # Tests for the "utf-8-sig" codec.
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader, once without a
        # size hint and once per listed size hint, and check that the
        # collected chunks add up to unistring every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input starts with a BOM, which must not show up in the output
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # same payload without a BOM
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
771
class EscapeDecodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_decode``."""

    def test_empty(self):
        # escape_decode() produces bytes.  Comparing its result with a
        # str-based tuple ("", 0) can never succeed because b"" != "",
        # so both the input and the expected output are given as bytes.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000775
class RecodingTest(unittest.TestCase):
    # Regression test without assertions: the statements only have to run
    # to completion (see the comment at the end of the method).
    def test_recoding(self):
        # Wrap an in-memory byte stream with codecs.EncodedFile, passing
        # "unicode_internal" and "utf-8" as the two encodings, then write
        # once and close the wrapper.
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000784
# From RFC 3492
# Each entry is a 2-tuple: (str to encode, punycode bytes).  Adjacent
# string literals inside an entry are concatenated by the parser, so a
# missing comma silently changes an entry's length.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
888
# Import-time sanity check: report any table entry that is not a pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000892
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the ``punycode_testcases`` table."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = text.encode("punycode")
            self.assertEqual(
                produced.decode("ascii").lower(),
                expected.decode("ascii").lower()
            )

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode a second time from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000911
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec.  The first three tests only
    # do anything when sys.maxunicode > 0xffff; otherwise they pass
    # without checking anything.
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            # (raw bytes, expected str) pairs that must decode successfully.
            ok = [
                (b"\x00\x10\xff\xff", "\U0010ffff"),
                (b"\x00\x00\x01\x01", "\U00000101"),
                (b"", ""),
            ]
            # Raw byte strings that must raise UnicodeDecodeError.
            not_ok = [
                b"\x7f\xff\xff\xff",
                b"\x80\x00\x00\x00",
                b"\x81\x00\x00\x00",
                b"\x00",
                b"\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                # The fixtures are written most-significant byte first;
                # reverse them on little-endian machines.
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                # The exception must name the codec, carry the full input
                # and point at bytes 4..8 as the offending range.
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                # This else belongs to the try: decoding without an error
                # is a failure.
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            # Register codecs.ignore_errors under a private handler name.
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = "ab".encode("unicode_internal").decode()
            # Splice four 0x22 bytes in after the first four bytes of the
            # encoded "ab"; the handler is expected to drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
            self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # The second item of an encoder's result is checked here: 1 for a
        # one-character str, 2 for a two-character str.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

        # Same check for escape_encode with a 4-byte input.
        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000968
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected output) pair of UTF-8 encoded byte
# strings.  Some entries have None as the expected output, and
# (None, None) placeholders stand in for vectors that are skipped, so
# the position in the list still matches the "3.N" numbering in the
# comments.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1121
1122
class NameprepTest(unittest.TestCase):
    """Run ``encodings.idna.nameprep`` over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep

        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError):
                    nameprep(text)
                continue
            wanted = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                # Re-raise with the vector number so the failing entry
                # can be located in the table.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001141
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001142class IDNACodecTest(unittest.TestCase):
1143 def test_builtin_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001144 self.assertEqual(str(b"python.org", "idna"), "python.org")
1145 self.assertEqual(str(b"python.org.", "idna"), "python.org.")
1146 self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
1147 self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001148
1149 def test_builtin_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001150 self.assertEqual("python.org".encode("idna"), b"python.org")
1151 self.assertEqual("python.org.".encode("idna"), b"python.org.")
1152 self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
1153 self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001154
Martin v. Löwis8b595142005-08-25 11:03:38 +00001155 def test_stream(self):
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00001156 r = codecs.getreader("idna")(io.BytesIO(b"abc"))
Martin v. Löwis8b595142005-08-25 11:03:38 +00001157 r.read(3)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001158 self.assertEqual(r.read(), "")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001159
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001160 def test_incremental_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001161 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001162 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001163 "python.org"
1164 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001165 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001166 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001167 "python.org."
1168 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001169 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001170 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001171 "pyth\xf6n.org."
1172 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001173 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001174 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001175 "pyth\xf6n.org."
1176 )
1177
1178 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001179 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1180 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1181 self.assertEqual(decoder.decode(b"rg"), "")
1182 self.assertEqual(decoder.decode(b"", True), "org")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001183
1184 decoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001185 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1186 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1187 self.assertEqual(decoder.decode(b"rg."), "org.")
1188 self.assertEqual(decoder.decode(b"", True), "")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001189
1190 def test_incremental_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001191 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001192 b"".join(codecs.iterencode("python.org", "idna")),
1193 b"python.org"
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001194 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001195 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001196 b"".join(codecs.iterencode("python.org.", "idna")),
1197 b"python.org."
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001198 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001199 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001200 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1201 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001202 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001203 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001204 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1205 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206 )
1207
1208 encoder = codecs.getincrementalencoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001209 self.assertEqual(encoder.encode("\xe4x"), b"")
1210 self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
1211 self.assertEqual(encoder.encode("", True), b"org")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001212
1213 encoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001214 self.assertEqual(encoder.encode("\xe4x"), b"")
1215 self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
1216 self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001217
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level functions of ``codecs``."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding argument given.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding argument given.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        previous = locale.setlocale(locale.LC_CTYPE)
        # Put the saved LC_CTYPE setting back once the test is over.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, previous)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        self.assertEqual(codecs.lookup('ASCII').name, 'ascii')
1272
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader over an in-memory stream."""

    def setUp(self):
        # Two encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        # The first line keeps its newline; the last line has none.
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001282
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the wrapper hands out
        # the same text as UTF-16-LE bytes.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written to the wrapper land in the
        # underlying file as Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001294
# Codec names that the BasicUnicodeTest loops iterate over.  The order is
# kept alphabetical, exactly as before; entries are merely grouped by family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    # cp* code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of the codecs module exposes it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1399
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# Encodings that don't work in stateful mode; the stream reader/writer
# checks skip them.
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: everything
# listed above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001412
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks that loop over every name in all_unicode_encodings.

    Each test drives the codec through a different interface (stateless
    functions, stream reader/writer, incremental coders, seek(), state
    handling) using plain ASCII sample text.
    """

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry, up to
            # "_" vs "-" and the two special cases handled below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip: encode, check the consumed length, decode.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect whatever bytes
                # the writer pushed into the queue after each write.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one byte at a time and read
                # whatever the reader can decode so far.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # Final call with an empty string and final=True flushes
                    # anything the encoder still holds.
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # NOTE(review): this check sits inside the
                # broken_incremental_coders guard, which already excludes
                # "idna" -- confirm the intended nesting.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Rewind and re-read several times; every pass must yield the
            # complete text again.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # Calling the decoder without input must fail ...
            self.assertRaises(TypeError, decoder)
            # ... and so must an int argument, except for the two codecs
            # excluded here.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            # Calling the encoder without input must fail.
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # Deliberately compares the type with itself: only the comparison
        # running to completion matters.
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # Both helpers are expected to come from
                # MixInCheckStateHandling -- the encode variant is not
                # visible here, TODO confirm.
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1545
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with string, int->str and int->int maps."""

    def _check_cases(self, cases):
        """Decode the three-byte sample for each (errors, mapping, outcome).

        A str outcome is the expected decoded text (all three input bytes
        must be reported as consumed); any other outcome is the exception
        class the call has to raise.
        """
        sample = b"\x00\x01\x02"
        for errors, mapping, outcome in cases:
            if isinstance(outcome, str):
                self.assertEqual(
                    codecs.charmap_decode(sample, errors, mapping),
                    (outcome, 3)
                )
            else:
                self.assertRaises(outcome,
                    codecs.charmap_decode, sample, errors, mapping
                )

    def _check_all_bytes_ignored(self, empty_mapping):
        # With an empty mapping and "ignore", every byte value is dropped
        # but still counted as consumed.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_mapping),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        self._check_cases([
            ("strict", "abc", "abc"),
            ("strict", "ab", UnicodeDecodeError),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._check_all_bytes_ignored("")

    def test_decode_with_int2str_map(self):
        self._check_cases([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
            ("strict", {0: 'a', 1: 'b'}, UnicodeDecodeError),
            ("replace", {0: 'a', 1: 'b'}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: None}, "ab\ufffd"),
            ("ignore", {0: 'a', 1: 'b'}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: None}, "ab"),
        ])
        self._check_all_bytes_ignored({})

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')
        self._check_cases([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: 0x110000, 1: b, 2: c}, TypeError),
            ("strict", {0: a, 1: b}, UnicodeDecodeError),
            ("replace", {0: a, 1: b}, "ab\ufffd"),
            ("ignore", {0: a, 1: b}, "ab"),
        ])
1682
1683
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        utf8_bytes = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(utf8_bytes, "latin-1", "utf-8") as recoder:
            content = recoder.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        utf8_bytes = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            utf8_bytes, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001696
class TypesTest(unittest.TestCase):
    """Which input types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only += [codecs.mbcs_decode]
        for decode in bytes_only:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001726
class SurrogateEscapeTest(unittest.TestCase):
    """The "surrogateescape" error handler on several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding raw must give text, and encoding text must give raw back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1759
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001760
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        payload = "1234567890"
        doubled = payload * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(codec_name):
            # Every scenario starts from a freshly truncated test file.
            return codecs.open(support.TESTFN, 'w+', encoding=codec_name)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                # Read the whole file twice from the start.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(payload[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(payload)
                writer.seek(writer.tell())
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001816
Victor Stinner3fed0872010-05-22 02:16:27 +00001817
# Bytes-to-bytes codecs exercised by TransformCodecTest.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# The two compression codecs are only listed when the module each one is
# paired with here can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1836
class TransformCodecTest(unittest.TestCase):
    """Round trips through the codecs in bytes_transform_encodings."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            decoded = codecs.getreader(encoding)(stream).read()
            self.assertEqual(decoded, b"\x80")

    def test_readline(self):
        # These two codecs are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                encoded = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(encoded)
                decoded = codecs.getreader(encoding)(stream).readline()
                self.assertEqual(decoded, b"\x80")
1864
1865
def test_main():
    """Run the test case classes of this module through support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()