blob: 1f465608335137851a69ff057dd0c96f4d047093 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Victor Stinner98fe1a02011-05-27 01:51:18 +02004import io
5import sys
6import unittest
7import warnings
Marc-André Lemburga37171d2001-06-19 20:09:28 +00008
class Queue(object):
    """
    First-in, first-out buffer: write() appends data at one end and
    read() removes data from the other end.
    """
    def __init__(self, buffer):
        # The initial buffer determines the type of data handled
        # (the tests in this file pass b"").
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return buffered data from the front of the queue.

        A negative size drains the whole buffer; otherwise at most size
        items are returned.
        """
        if size < 0:
            # Slicing with [:0] leaves an empty buffer of the same type
            taken, self._buffer = self._buffer, self._buffer[:0]
        else:
            taken, self._buffer = self._buffer[:size], self._buffer[size:]
        return taken
28
class MixInCheckStateHandling:
    """Mixin with checks for getstate()/setstate() of incremental codecs.

    The methods use assertIsInstance/assertFalse/assertEqual of the class
    this mixin is combined with.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be moved between decoder objects.

        For every split point of the encoded input s, the first part is
        decoded with one incremental decoder, its state is transferred to a
        second decoder which decodes the rest; both parts together must
        equal u.
        """
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            first = new_decoder()
            head = first.decode(s[:split])
            state = first.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                first.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(first.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, first.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            second = new_decoder()
            second.setstate(state)
            tail = second.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be moved between encoder objects.

        For every split point of the text u, the first part is encoded with
        one incremental encoder, its state is transferred to a second
        encoder which encodes the rest; both parts together must equal s.
        """
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            first = new_encoder()
            head = first.encode(u[:split])
            state = first.getstate()
            second = new_encoder()
            second.setstate(state)
            tail = second.encode(u[split:], True)
            self.assertEqual(s, head + tail)
61
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader tests shared by the per-encoding test classes.

    The tests read ``self.encoding``, which is not defined here; the
    subclasses in this file set it to the name of the codec under test.
    """

    def _make_reader(self, stream):
        """Return a StreamReader for self.encoding that wraps stream.

        A DeprecationWarning issued while the reader is constructed is
        ignored, so every reader in these tests is created the same way.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            return codecs.getreader(self.encoding)(stream)

    def check_partial(self, input, partialresults):
        """Feed the encoded input byte by byte and compare partial output.

        After each byte, the text decoded so far must equal the matching
        entry of partialresults.  The check is run with a StreamReader,
        with an incremental decoder, with the same decoder after reset(),
        and finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = self._make_reader(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same check as above, using an incremental decoder
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        feed_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        """Check StreamReader.readline() for all supported line ends."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return self._make_reader(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined with "|"
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: str.split() treats each of
        # them as whitespace, so splitting a string of them yields an empty
        # list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line is non-empty: with keepends=False an empty line
            # would make readalllines() stop early.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # each repetition of the pattern ends with an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating over a reader must return every input line unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = self._make_reader(stream)
        lines = list(reader)
        for (expected, line) in zip(s, lines):
            self.assertEqual(line, expected)
        # A reader that stops early must not pass the test
        self.assertEqual(len(lines), len(s))

    def test_readlinequeue(self):
        """Check readline() when data arrives piecewise through a Queue."""
        q = Queue(b"")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            writer = codecs.getwriter(self.encoding)(q)
            reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return three specific lines and then ""."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        reader = self._make_reader(io.BytesIO(s))
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        """readline() must return five specific lines and then ""."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = self._make_reader(io.BytesIO(s))
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)
Walter Dörwald9fa09462005-01-10 12:01:39 +0000293
class UTF32Test(ReadTest):
    """Reader tests for the "utf-32" codec, which uses a BOM."""
    encoding = "utf-32"

    # "spamspam" with a leading BOM, little-endian and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes to one stream writer must emit a single BOM."""
        codec = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            out = codec.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        raw = io.BytesIO(data)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            inp = codec.streamreader(raw)
        self.assertEqual(inp.read(), "spamspam")

    def test_badbom(self):
        """Reading input that starts with an invalid BOM raises UnicodeError."""
        for garbage in (4*b"\xff", 8*b"\xff"):
            stream = io.BytesIO(garbage)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # four BOM bytes: the byte order is known after the fourth
            4 * [""]
            # every character takes four bytes and is emitted on the last
            + 3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_handlers(self):
        """A truncated input is handled by the 'replace'/'ignore' handlers."""
        decode = codecs.utf_32_decode
        self.assertEqual(('\ufffd', 1), decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1), decode(b'\x01', 'ignore', True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
388
class UTF32LETest(ReadTest):
    """Reader tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # every character takes four bytes and is emitted on the last
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
428
class UTF32BETest(ReadTest):
    """Reader tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # every character takes four bytes and is emitted on the last
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
468
469
class UTF16Test(ReadTest):
    """Reader tests for the "utf-16" codec, which uses a BOM."""
    encoding = "utf-16"

    # "spamspam" with a leading BOM, little-endian and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes to one stream writer must emit a single BOM."""
        codec = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            out = codec.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        raw = io.BytesIO(data)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            inp = codec.streamreader(raw)
        self.assertEqual(inp.read(), "spamspam")

    def test_badbom(self):
        """Reading input that starts with an invalid BOM raises UnicodeError."""
        for garbage in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream = io.BytesIO(garbage)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # two BOM bytes: the byte order is known after the second
            2 * [""]
            # every character takes two bytes and is emitted on the second
            + [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_handlers(self):
        """A truncated input is handled by the 'replace'/'ignore' handlers."""
        decode = codecs.utf_16_decode
        self.assertEqual(('\ufffd', 1), decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1), decode(b'\x01', 'ignore', True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(support.TESTFN, 'U',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000554
class UTF16LETest(ReadTest):
    """Reader tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # every character takes two bytes and is emitted on the second
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
582
class UTF16BETest(ReadTest):
    """Reader tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            # every character takes two bytes and is emitted on the second
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
610
class UTF8Test(ReadTest):
    """Reader tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            # a character is emitted once its last byte has been read
            ["\x00"]                                    # "\x00": one byte
            + ["\x00"] + ["\x00\xff"]                   # "\xff": two bytes
            + ["\x00\xff"] + ["\x00\xff\u07ff"]         # "\u07ff": two bytes
            + 2 * ["\x00\xff\u07ff"]                    # "\u0800": three bytes
            + ["\x00\xff\u07ff\u0800"]
            + 2 * ["\x00\xff\u07ff\u0800"]              # "\uffff": three bytes
            + ["\x00\xff\u07ff\u0800\uffff"]
        )

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         text, text.encode(self.encoding))

    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler deals with them."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for (handler, expected) in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        text = "abc\ud800def"
        encoded = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
        self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000657
class UTF7Test(ReadTest):
    """Reader tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # expected decoded text after each further byte of the encoded input
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial("a+-b", expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000672
class UTF16ExTest(unittest.TestCase):
    """Error-path tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # a single byte is an incomplete code unit; with final=True and
        # "strict" error handling this must raise
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
680
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # no argument at all, then an argument of an unsupported type
        for args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *args)
696
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM).

    ``check_partial`` is inherited from ``ReadTest``, which is defined
    elsewhere in this file.
    """

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding prepends a BOM; the incremental decoder must strip it
        # again so the text round-trips unchanged.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Decode *bytestring* through a "utf-8-sig" stream reader using a
        # range of read sizes and check that the concatenated output equals
        # *unistring* for every size.  A sizehint of None means one
        # unbounded read() per iteration.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            # Any DeprecationWarning raised while constructing the stream
            # reader is deliberately ignored.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Same payload without a BOM; decoding must give the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
780
class EscapeDecodeTest(unittest.TestCase):
    """Checks for ``codecs.escape_decode``."""

    def test_empty(self):
        # escape_decode is a bytes-to-bytes decoder: the decoded value is a
        # bytes object, so the expected tuple must hold b"" (a str "" never
        # compares equal to it).
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000784
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000785class RecodingTest(unittest.TestCase):
786 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000787 f = io.BytesIO()
Victor Stinner98fe1a02011-05-27 01:51:18 +0200788 with warnings.catch_warnings():
789 warnings.simplefilter("ignore", DeprecationWarning)
790 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000791 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000792 f2.close()
793 # Python used to crash on this at exit because of a refcount
794 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000795
# From RFC 3492
# Each entry is a (unicode_text, punycode_bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a 2-tuple (for
# example because a comma went missing between two string literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000903
class PunycodeTest(unittest.TestCase):
    """Round-trip the module-level ``punycode_testcases`` vectors."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt through an ASCII
            # round trip, keeping the loop variable itself untouched.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000922
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000923class UnicodeInternalTest(unittest.TestCase):
924 def test_bug1251300(self):
925 # Decoding with unicode_internal used to not correctly handle "code
926 # points" above 0x10ffff on UCS-4 builds.
927 if sys.maxunicode > 0xffff:
928 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000929 (b"\x00\x10\xff\xff", "\U0010ffff"),
930 (b"\x00\x00\x01\x01", "\U00000101"),
931 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000932 ]
933 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000934 b"\x7f\xff\xff\xff",
935 b"\x80\x00\x00\x00",
936 b"\x81\x00\x00\x00",
937 b"\x00",
938 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000939 ]
940 for internal, uni in ok:
941 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000942 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000943 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000944 for internal in not_ok:
945 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000946 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000947 self.assertRaises(UnicodeDecodeError, internal.decode,
948 "unicode_internal")
949
950 def test_decode_error_attributes(self):
951 if sys.maxunicode > 0xffff:
952 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000953 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000954 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000955 self.assertEqual("unicode_internal", ex.encoding)
956 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
957 self.assertEqual(4, ex.start)
958 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000959 else:
960 self.fail()
961
962 def test_decode_callback(self):
963 if sys.maxunicode > 0xffff:
964 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
965 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000966 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000967 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
968 "ascii"),
969 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000970 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000971
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000972 def test_encode_length(self):
973 # Issue 3739
974 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000975 self.assertEqual(encoder("a")[1], 1)
976 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000977
Ezio Melottib3aedd42010-11-20 19:04:17 +0000978 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000979
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks input that is used to check for a
# UnicodeError; (None, None) is a placeholder for a skipped vector so that
# list positions keep matching the "3.N" numbering.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1132
1133
class NameprepTest(unittest.TestCase):
    """Run ``encodings.idna.nameprep`` over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number ("3.N") in the
                    # message, keeping the original exception as the cause.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos+1, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001152
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Any DeprecationWarning raised while constructing the stream
        # reader is deliberately ignored.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Everything has been consumed; a further read returns "".
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII name without and with a trailing dot (this pair used to
        # be the same dotted input asserted twice).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once its terminating dot (or the final
        # flag) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without and with a trailing dot (this pair used to
        # be the same dotted input asserted twice).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once its terminating dot (or the final
        # flag) has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001230
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level ``codecs`` functions."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding given, the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # With no encoding given, the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # U+FFFF is not encodable as ASCII.  (Written as '\uffff': the
        # literal '\xffff' is the three characters U+00FF, 'f', 'f'.)
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\uffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # The search function must be callable.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001272
class StreamReaderTest(unittest.TestCase):
    """Check line-wise reading through the UTF-8 stream reader."""

    def setUp(self):
        # Any DeprecationWarning raised while fetching the reader class is
        # ignored.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        # Any DeprecationWarning raised while instantiating the reader is
        # ignored as well.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            f = self.reader(self.stream)
        lines = f.readlines()
        # The line terminator stays attached to the first line.
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001286
class EncodedFileTest(unittest.TestCase):
    """Check transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the wrapped file holds UTF-8 data, read() returns the
        # same text encoded as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data handed to write() ends up in the wrapped file
        # encoded as Latin-1.
        f = io.BytesIO()
        # Use the same DeprecationWarning filter as for the first
        # EncodedFile above so both constructions are treated alike.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001300
# Names of the text codecs exercised by the generic tests below.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]

# Codecs skipped by the incremental encoder/decoder checks: everything that
# is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001418
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    # Failure message shared by the round-trip assertions.
    _MISMATCH = "%r != %r (encoding=%r)"

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            self._check_name(encoding)
            self._check_stateless(encoding, text)

            if encoding not in broken_unicode_with_streams:
                self._check_streams(encoding, text)

            if encoding not in broken_incremental_coders:
                self._check_incremental(encoding, text)
                if encoding not in ("idna", "mbcs"):
                    self._check_incremental_ignore(encoding, text)

    def _check_name(self, encoding):
        # The name reported by lookup() must match the requested name, up to
        # "_" versus "-" and the two special cases handled below.
        canonical = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            canonical += "_codec"
        elif encoding == "latin_1":
            canonical = "latin_1"
        self.assertEqual(encoding.replace("_", "-"),
                         canonical.replace("_", "-"))

    def _check_stateless(self, encoding, text):
        # Round trip through the stateless encoder and decoder functions.
        encoded, consumed = codecs.getencoder(encoding)(text)
        self.assertEqual(consumed, len(text),
                         self._MISMATCH % (consumed, len(text), encoding))
        decoded, consumed = codecs.getdecoder(encoding)(encoded)
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

    def _check_streams(self, encoding, text):
        # check stream reader/writer
        sink = Queue(b"")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            writer = codecs.getwriter(encoding)(sink)
        encoded = b""
        for char in text:
            # Write one character at a time and collect whatever the writer
            # pushed into the queue.
            writer.write(char)
            chunk = sink.read()
            self.assertTrue(type(chunk) is bytes, type(chunk))
            encoded += chunk

        source = Queue(b"")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            reader = codecs.getreader(encoding)(source)
        decoded = ""
        for byte in encoded:
            # Feed the reader one byte at a time.
            source.write(bytes([byte]))
            decoded += reader.read()
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

    def _check_incremental(self, encoding, text):
        # check incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        pieces = [encoder.encode(char) for char in text]
        pieces.append(encoder.encode("", True))
        encoded = b"".join(pieces)
        decoder = codecs.getincrementaldecoder(encoding)()
        parts = [decoder.decode(bytes([byte])) for byte in encoded]
        parts.append(decoder.decode(b"", True))
        decoded = "".join(parts)
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

        # check C API
        pieces = [cencoder.encode(char) for char in text]
        pieces.append(cencoder.encode("", True))
        encoded = b"".join(pieces)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding)
        parts = [cdecoder.decode(bytes([byte])) for byte in encoded]
        parts.append(cdecoder.decode(b"", True))
        decoded = "".join(parts)
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

        # check iterencode()/iterdecode()
        encoded_iter = codecs.iterencode(text, encoding)
        result = "".join(codecs.iterdecode(encoded_iter, encoding))
        self.assertEqual(result, text,
                         self._MISMATCH % (result, text, encoding))

        # check iterencode()/iterdecode() with empty string
        encoded_iter = codecs.iterencode("", encoding)
        result = "".join(codecs.iterdecode(encoded_iter, encoding))
        self.assertEqual(result, "")

    def _check_incremental_ignore(self, encoding, text):
        # check incremental decoder/encoder with errors argument
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        # Python API (no final flush call in this variant).
        encoded = b"".join(encoder.encode(char) for char in text)
        decoder = codecs.getincrementaldecoder(encoding)("ignore")
        decoded = "".join(decoder.decode(bytes([byte])) for byte in encoded)
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

        # C API
        encoded = b"".join(cencoder.encode(char) for char in text)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
        decoded = "".join(cdecoder.decode(bytes([byte])) for byte in encoded)
        self.assertEqual(decoded, text,
                         self._MISMATCH % (decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped; FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                raw = io.BytesIO(text.encode(encoding))
                reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and
                # buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # The integer argument check is not applied to these two codecs.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            encoded = u.encode(encoding)
            self.check_state_handling_decode(encoding, u, encoded)
            self.check_state_handling_encode(encoding, u, encoded)
1557
class CharmapTest(unittest.TestCase):
    """Check codecs.charmap_decode() when the mapping is a string."""

    def test_decode_with_string_map(self):
        data = b"\x00\x01\x02"
        # (error handler, mapping string, expected result).  The third byte
        # has no usable entry whenever the mapping is only two characters
        # long or maps it to U+FFFE; the expected results show the error
        # handler being applied in both situations.
        cases = (
            ("strict", "abc", ("abc", 3)),
            ("replace", "ab", ("ab\ufffd", 3)),
            ("replace", "ab\ufffe", ("ab\ufffd", 3)),
            ("ignore", "ab", ("ab", 3)),
            ("ignore", "ab\ufffe", ("ab", 3)),
        )
        for errors, mapping, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(data, errors, mapping),
                expected
            )

        # With an empty mapping and "ignore" nothing is produced, but the
        # whole input still counts as consumed.
        allbytes = bytes(range(256))
        result = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(result, ("", len(allbytes)))
1590
class WithStmtTest(unittest.TestCase):
    """Check that the codecs file wrappers work as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
                # The UTF-8 content is read back encoded as Latin-1.
                content = ef.read()
                self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        reader_class = info.streamreader
        writer_class = info.streamwriter
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            with codecs.StreamReaderWriter(raw, reader_class,
                                           writer_class, 'strict') as srw:
                content = srw.read()
                self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001607
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), expected)
            self.assertEqual(decode(br"\u1234"), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001637
class SurrogateEscapeTest(unittest.TestCase):
    """Check the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding must give the expected text and encoding that text must
        # give back the original bytes.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1670
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001671
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open()."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            self._check_bom_written_once(encoding, data)

            # This scenario is run twice: the second run is the one
            # labelled "(StreamWriter)", whose steps are the same.
            self._check_rewrite_from_start(encoding, data)
            self._check_rewrite_from_start(encoding, data)

            # Likewise run twice (plain and "(StreamWriter)" variant).
            self._check_write_after_seek_elsewhere(encoding, data)
            self._check_write_after_seek_elsewhere(encoding, data)

    def _check_bom_written_once(self, encoding, data):
        # Check if the BOM is written only once
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            f.write(data)
            f.write(data)
            # Read everything back twice, rewinding before each read.
            for _ in range(2):
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

    def _check_rewrite_from_start(self, encoding, data):
        # Check that the BOM is written after a seek(0)
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            f.write(data[0])
            self.assertNotEqual(f.tell(), 0)
            f.seek(0)
            f.write(data)
            f.seek(0)
            self.assertEqual(f.read(), data)

    def _check_write_after_seek_elsewhere(self, encoding, data):
        # Check that the BOM is not written after a seek() at a
        # position different than the start
        with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
            f.write(data)
            f.seek(f.tell())
            f.write(data)
            f.seek(0)
            self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001727
Victor Stinner3fed0872010-05-22 02:16:27 +00001728
# Bytes-to-bytes codecs that are always tested.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# The compression codecs are only tested when the module they rely on can
# be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1747
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the codecs in bytes_transform_encodings."""

    def _encoded_reader(self, encoding):
        # Encode the single byte 0x80 and return a stream reader over the
        # encoded form.  Any DeprecationWarning raised while creating the
        # reader is ignored.
        sin = codecs.encode(b"\x80", encoding)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
        return reader

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            reader = self._encoded_reader(encoding)
            sout = reader.read()
            self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # readline() is not checked for these two codecs.
            if encoding in ('uu_codec', 'zlib_codec'):
                continue
            reader = self._encoded_reader(encoding)
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
1779
1780
def test_main():
    """Run the listed test case classes through support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001810
1811
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()