blob: 42d0da3e703bba9db6f69db0eceee986329a1675 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The queue is backed by a single sliceable buffer (the tests pass
    ``b""``); ``write()`` appends to it and ``read()`` removes data from
    the front.
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the type of everything read back.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.  Reading
        from an empty queue returns an empty buffer of the same type.
        """
        if size < 0:
            size = len(self._buffer)
        # Slicing keeps the buffer's own type for both halves.
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
26
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks.

    Must be combined with a unittest.TestCase, whose assert* methods are
    used here.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted and resumed anywhere.

        For every split point of the bytes *s*, decode the first part,
        capture the decoder state, restore it into a brand new decoder and
        decode the rest; the concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted and resumed anywhere.

        For every split point of the string *u*, encode the first part,
        transfer the encoder state to a new encoder and encode the rest;
        the concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reading tests shared by the codec-specific test classes.

    Every method reads ``self.encoding``, which this class does not
    define itself; the subclasses below set it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until the reader returns an empty string and
            # return them joined with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Spell the line ends out as a tuple: str.split() without an
        # argument treats every one of them as whitespace and returns an
        # empty list, which silently turned the loops below into no-ops.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would end readalllines() early.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows every test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly these lines.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data can arrive between
        # readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec (byte order taken from the BOM)."""
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    """Tests for the little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # A character only appears once all four of its bytes were fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
406
class UTF32BETest(ReadTest):
    """Tests for the big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # A character only appears once all four of its bytes were fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
446
447
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec (byte order taken from the BOM)."""
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup first so the file is removed even if a
        # later step fails.
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000523
class UTF16LETest(ReadTest):
    """Tests for the little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # A character only appears once both of its bytes were fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
562
class UTF16BETest(ReadTest):
    """Tests for the big-endian "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # A character only appears once both of its bytes were fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
601
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # A character only appears once its last encoded byte was fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode; the other error
        # handlers each substitute their own replacement.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated surrogate sequence must still be an error.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000650
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per encoded byte of "a+-b".
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000665
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte with final=True cannot be decoded in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
673
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # The function accepts an array.array and returns its bytes.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument and an int argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
689
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means an unbounded read()) and check that the
        # concatenated chunks equal unistring each time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not show up in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
769
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected value has to be
        # b"" rather than "": in Python 3 bytes and str never compare
        # equal, which made the old expectation ("", 0) unsatisfiable.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        # A str argument is accepted too and likewise yields bytes.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000773
class RecodingTest(unittest.TestCase):
    """Smoke test for writing through a codecs.EncodedFile wrapper."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so there is nothing to assert here:
        # the write/close sequence itself is the test.
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000782
# Sample strings from RFC 3492.
# Each entry is a 2-tuple (unicode_text, punycode_bytes): the first element
# is the text as a str, the second the corresponding Punycode as bytes.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print any entry that is not
# a 2-tuple (for example after a misplaced comma or parenthesis).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000890
class PunycodeTest(unittest.TestCase):
    """Checks the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Both sides are compared in lower case: some of the extended
            # encodings in the table use upper case while our encoder only
            # produces lower case.  Lowering just the expected value would
            # not be enough, because some of the input characters are
            # upper case as well.
            produced = str(uni.encode("punycode"), "ascii")
            expected = str(puny, "ascii")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000909
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec.

    The first three tests return immediately unless sys.maxunicode is
    above 0xffff.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            return

        # The byte strings below are written most significant byte first
        # and get reversed on little-endian machines.
        swap = sys.byteorder == "little"
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        invalid = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in valid:
            if swap:
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in invalid:
            if swap:
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode <= 0xffff:
            return

        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            payload.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            # The error must identify the codec, carry the input object
            # and point at bytes 4..8.
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(payload, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    def test_decode_callback(self):
        if sys.maxunicode <= 0xffff:
            return

        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Four 0x22 bytes are wedged between the encoded "a" and "b"; the
        # registered handler is expected to make the decoder drop them.
        data = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), "ascii")
        ignored = decoder(data, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        for text, length in (("a", 1), ("\xe9\u0142", 2)):
            self.assertEqual(encoder(text)[1], length)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000966
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is a pair (orig, prepped) of UTF-8 encoded byte strings, as
# consumed by NameprepTest.test_nameprep:
#   * prepped is None  -> nameprep(orig) is expected to raise UnicodeError
#   * (None, None)     -> the vector is skipped; the placeholder keeps the
#                         list position in step with the "3.N" numbering
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    # NOTE(review): this matches the bytes used below, so the comment
    # presumably meant \xc3\xdf as in 3.44 -- confirm against the draft.
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1119
1120
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep

        def as_text(raw):
            # The Unicode strings are given in UTF-8
            return str(raw, "utf-8", "surrogatepass")

        # Vectors are numbered from 1 so failure messages read "3.<n>".
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            orig = as_text(orig)
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
                continue
            prepped = as_text(prepped)
            try:
                self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                # Re-raise with the vector number so the failing entry
                # can be located in the table.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001139
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, text in cases:
            self.assertEqual(str(encoded, "idna"), text)

    def test_builtin_encode(self):
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, encoded in cases:
            self.assertEqual(text.encode("idna"), encoded)

    def test_stream(self):
        # After the first read(3), a further read() must return "".
        stream_reader = codecs.getreader("idna")(io.BytesIO(b"abc"))
        stream_reader.read(3)
        self.assertEqual(stream_reader.read(), "")

    def test_incremental_decode(self):
        def decode_bytewise(data):
            # Feed the input to iterdecode() one byte at a time.
            pieces = (bytes([c]) for c in data)
            return "".join(codecs.iterdecode(pieces, "idna"))

        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for data, expected in cases:
            self.assertEqual(decode_bytewise(data), expected)

        decoder = codecs.getincrementaldecoder("idna")()

        # Without a trailing dot the last label is held back until the
        # final call.
        steps = (
            (b"xn--xam", False, ""),
            (b"ple-9ta.o", False, "\xe4xample."),
            (b"rg", False, ""),
            (b"", True, "org"),
        )
        for chunk, final, expected in steps:
            self.assertEqual(decoder.decode(chunk, final), expected)

        decoder.reset()

        # With a trailing dot the last label is emitted right away and
        # nothing is left for the final call.
        steps = (
            (b"xn--xam", False, ""),
            (b"ple-9ta.o", False, "\xe4xample."),
            (b"rg.", False, "org."),
            (b"", True, ""),
        )
        for chunk, final, expected in steps:
            self.assertEqual(decoder.decode(chunk, final), expected)

    def test_incremental_encode(self):
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()

        # Without a trailing dot the last label is only produced by the
        # final call.
        steps = (
            ("\xe4x", False, b""),
            ("ample.org", False, b"xn--xample-9ta."),
            ("", True, b"org"),
        )
        for chunk, final, expected in steps:
            self.assertEqual(encoder.encode(chunk, final), expected)

        encoder.reset()

        # With a trailing dot everything is produced before the final call.
        steps = (
            ("\xe4x", False, b""),
            ("ample.org.", False, b"xn--xample-9ta.org."),
            ("", True, b""),
        )
        for chunk, final, expected in steps:
            self.assertEqual(encoder.encode(chunk, final), expected)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001215
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the codecs module-level functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding argument given.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding argument given.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.register(*bad_args)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        previous = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it always runs.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, previous)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1270
class StreamReaderTest(unittest.TestCase):
    """Tests for the StreamReader returned by codecs.getreader('utf-8')."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split after the newline and keep it on the
        # first line.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001280
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the underlying file come back
        # recoded as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data handed to the wrapper lands in the
        # underlying file as Latin-1.
        sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001292
# Text codecs exercised by the generic tests; entries are grouped by family
# but kept in the same (sorted) order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Incremental coders are skipped for the same codecs, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001410
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Checks applied uniformly to every name in all_unicode_encodings."""

    def test_basics(self):
        # Round-trip a short ASCII string through each codec's stateless,
        # stream and incremental interfaces.
        s = "abc123"  # all codecs should be able to encode these

        def check(actual, expected, encoding):
            # assertEqual() with the codec name included in the message.
            self.assertEqual(actual, expected,
                             "%r != %r (encoding=%r)" % (actual, expected, encoding))

        def encode_by_char(coder, text, final):
            # Feed the text one character at a time; flush when asked to.
            out = b"".join(coder.encode(ch) for ch in text)
            if final:
                out += coder.encode("", True)
            return out

        def decode_by_byte(coder, data, final):
            # Feed the data one byte at a time; flush when asked to.
            out = "".join(coder.decode(bytes([octet])) for octet in data)
            if final:
                out += coder.decode(b"", True)
            return out

        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the requested one,
            # ignoring "_" versus "-" and the two special cases below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encoder/decoder
            encoded, consumed = codecs.getencoder(encoding)(s)
            check(consumed, len(s), encoding)
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            check(decoded, s, encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                sink = Queue(b"")
                writer = codecs.getwriter(encoding)(sink)
                pieces = []
                for char in s:
                    writer.write(char)
                    chunk = sink.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    pieces.append(chunk)
                encodedresult = b"".join(pieces)

                source = Queue(b"")
                reader = codecs.getreader(encoding)(source)
                decodedresult = ""
                for octet in encodedresult:
                    source.write(bytes([octet]))
                    decodedresult += reader.read()
                check(decodedresult, s, encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = encode_by_char(encoder, s, True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = decode_by_byte(decoder, encodedresult, True)
                    check(decodedresult, s, encoding)

                    # check C API
                    encodedresult = encode_by_char(cencoder, s, True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = decode_by_byte(cdecoder, encodedresult, True)
                    check(decodedresult, s, encoding)

                    # check iterencode()/iterdecode()
                    chunks = codecs.iterencode(s, encoding)
                    result = "".join(codecs.iterdecode(chunks, encoding))
                    check(result, s, encoding)

                    # check iterencode()/iterdecode() with empty string
                    chunks = codecs.iterencode("", encoding)
                    result = "".join(codecs.iterdecode(chunks, encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = encode_by_char(encoder, s, False)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = decode_by_byte(decoder, encodedresult, False)
                        check(decodedresult, s, encoding)

                        encodedresult = encode_by_char(cencoder, s, False)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = decode_by_byte(cdecoder, encodedresult, False)
                        check(decodedresult, s, encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(s, reader.read())

    def test_bad_decode_args(self):
        # Every decoder rejects a missing argument; all but "idna" and
        # "punycode" are also checked against an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        # Every encoder rejects a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with self.assertRaises(TypeError):
                encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
1543
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with str, int->str and int->int tables."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))

        # Byte 2 has no entry in a two-character table.
        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", "ab")

        # A U+FFFE table entry is expected to behave like a missing one.
        for errors, table, expected in [
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ]:
            self.assertEqual(decode(data, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""), ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Complete tables, including multi-character, non-BMP and empty
        # replacement strings.
        for table, expected in [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]:
            self.assertEqual(decode(data, "strict", table), (expected, 3))

        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", {0: 'a', 1: 'b'})

        # A missing key and an explicit None value are handled the same way.
        for errors, table, expected in [
            ("replace", {0: 'a', 1: 'b'}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: None}, "ab\ufffd"),
            ("ignore", {0: 'a', 1: 'b'}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: None}, "ab"),
        ]:
            self.assertEqual(decode(data, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}), ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = map(ord, 'abc')

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}), ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        # 0x110000 is one past the highest code point.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: 0x110000, 1: b, 2: c})

        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", {0: a, 1: b})

        self.assertEqual(decode(data, "replace", {0: a, 1: b}), ("ab\ufffd", 3))
        self.assertEqual(decode(data, "ignore", {0: a, 1: b}), ("ab", 3))
1680
1681
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            content = recoder.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001694
class TypesTest(unittest.TestCase):
    """Argument type handling of the low-level decoder functions."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), expected)
            self.assertEqual(decode(br"\u1234"), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001724
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        cases = [
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        ]
        for raw, text in cases:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1757
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001758
class BomTest(unittest.TestCase):
    """BOM handling of files opened with codecs.open() across seeks."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16", "utf-16-le", "utf-16-be",
                 "utf-32", "utf-32-le", "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file in 'w+' mode with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in tests:
            doubled = data * 2

            # Check if the BOM is written only once
            with reopen(encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001814
Victor Stinner3fed0872010-05-22 02:16:27 +00001815
# Bytes-to-bytes transform codecs.  The compression codecs are added only
# when their backing module can be imported.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1834
class TransformCodecTest(unittest.TestCase):
    """Round trips through the codecs in bytes_transform_encodings."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # 'uu_codec' and 'zlib_codec' are excluded from the readline() check.
        for encoding in bytes_transform_encodings:
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
1862
1863
def test_main():
    """Pass the listed test case classes to support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001893
1894
# Call test_main() when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()