blob: c0450e76def02daa2e6e3e1549f24c017e13f2f7 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import unittest
Victor Stinner05010702011-05-27 16:50:40 +02003import codecs
4import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    FIFO buffer: data appended with write() comes back out of read()
    in the same order.
    """
    def __init__(self, buffer):
        # The initial value also fixes the element type (bytes or str)
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """
        Remove and return the first size items of the queue.
        A negative size drains the whole queue.
        """
        if size >= 0:
            head = self._buffer[:size]
            tail = self._buffer[size:]
        else:
            head = self._buffer
            tail = self._buffer[:0]  # empty object of the same type
        self._buffer = tail
        return head
25
class MixInCheckStateHandling:
    """
    Helpers for TestCase classes: verify that an incremental codec can be
    interrupted at any position, have its state moved into a fresh codec
    object via getstate()/setstate(), and still yield the complete result.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Decode s split at every offset; the two parts must join to u."""
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            saved = decoder.getstate()
            self.assertIsInstance(saved[1], int)
            # Verify the guarantee documented for
            # IncrementalDecoder.getstate()
            if not saved[1]:
                # put the decoder back into its pristine state (empty
                # buffer of the right type, no flags)
                decoder.setstate((saved[0][:0], 0))
                # Re-feeding the buffered input must not emit anything
                self.assertFalse(decoder.decode(saved[0]))
                # ... and must bring the decoder back to the saved state
                self.assertEqual(saved, decoder.getstate())
            # Continue with a brand new decoder primed with the state
            # taken from the first one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(saved)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode u split at every offset; the two parts must join to s."""
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            saved = encoder.getstate()
            # hand the state over to a fresh encoder and finish the job
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(saved)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """
    Reader tests shared by all codecs.

    Subclasses set the ``encoding`` class attribute; every test encodes
    its input with that codec and reads it back through the codec's
    StreamReader and/or incremental decoder.
    """

    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to the decoder one byte at a time.
        # partialresults[i] is the complete text that must have been decoded
        # once byte i has been fed.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so that the line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: every one of them is
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and loops over it would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty: readalllines() stops at
            # the first empty result, which an empty line would produce
            # when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expected values must too
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the second line of each repetition as well,
                    # otherwise the next iteration would see "xxx\n"
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must return exactly the lines that
        # were written, for a realistic multi-line "\r\n" terminated text.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000278
class UTF32Test(ReadTest):
    """Tests for the BOM-prefixed "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write the text in two pieces through a stream writer
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for piece in ("spam", "spam"):
            out.write(piece)
        d = raw.getvalue()
        # only the first write may have emitted a BOM
        self.assertTrue(d in (self.spamle, self.spambe))
        # and the stream reader must get the text back
        back = info.streamreader(io.BytesIO(d))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # neither one nor two code units of 0xff bytes start with a BOM
        for count in (4, 8):
            f = codecs.getreader(self.encoding)(io.BytesIO(count*b"\xff"))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            # four BOM bytes (byteorder known after the fourth) plus the
            # first three bytes of "\x00": nothing decoded yet
            7 * [""]
            # every character is complete with its fourth byte
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        # a truncated code unit is handled by the error handler
        for handler, replacement in (("replace", '\ufffd'), ("ignore", '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
365
class UTF32LETest(ReadTest):
    """Tests for the BOM-less little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            # first three bytes of "\x00": nothing decoded yet
            3 * [""]
            # from then on a character completes every four bytes
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
405
class UTF32BETest(ReadTest):
    """Tests for the BOM-less big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            # first three bytes of "\x00": nothing decoded yet
            3 * [""]
            # from then on a character completes every four bytes
            + 4 * [text[:1]]
            + 4 * [text[:2]]
            + 4 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
445
446
class UTF16Test(ReadTest):
    """Tests for the BOM-prefixed "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # 0xffff is not a BOM, so the reader cannot determine the byteorder
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated code unit is handled by the error handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the universal newlines flag 'U':
        # 'U' is rejected by open() in newer Python versions, and 'r' checks
        # the same thing (the "\r\n" line ends must come back unchanged).
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000522
class UTF16LETest(ReadTest):
    """Tests for the BOM-less little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            # first byte of "\x00": nothing decoded yet
            [""]
            # from then on a character completes every two bytes
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
550
class UTF16BETest(ReadTest):
    """Tests for the BOM-less big-endian "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            # first byte of "\x00": nothing decoded yet
            [""]
            # from then on a character completes every two bytes
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
578
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff"
        expected = (
            2 * [text[:1]]      # "\x00", then first byte of "\xff"
            + 2 * [text[:2]]    # "\xff" done, then first byte of "\u07ff"
            + 3 * [text[:3]]    # "\u07ff" done, two bytes of "\u0800"
            + 3 * [text[:4]]    # "\u0800" done, two bytes of "\uffff"
            + [text]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # strict mode rejects lone surrogates in both directions
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # the other error handlers each substitute in their own way
        handled = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handled:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        text = "abc\ud800def"
        raw = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
        self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000625
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        # one expected partial result per encoded byte
        expected = ["a", "a", "a+", "a+-", text]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000640
class UTF16ExTest(unittest.TestCase):
    """Tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # a single byte is a truncated code unit when final is true
        args = (b"\xff", "strict", 0, True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, *args)

    def test_bad_args(self):
        # the data argument is required
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
648
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        # the buffer content comes back as bytes together with its length
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # a missing argument and a non-buffer argument are both rejected
        for args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *args)
664
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional signature)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = (
            # three bytes of the signature (read and skipped) plus the
            # first two bytes of the encoded "\ufeff"
            5 * [""]
            + [text[:1]]        # second BOM has been read and emitted
            + 2 * [text[:2]]    # "\x00", then first byte of "\xff"
            + 2 * [text[:3]]    # "\xff" done, then first byte of "\u07ff"
            + 3 * [text[:4]]    # "\u07ff" done, two bytes of "\u0800"
            + 3 * [text[:5]]    # "\u0800" done, two bytes of "\uffff"
            + [text]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        s = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a StreamReader with a range of size
        # arguments (None means a plain read()) and check that the
        # collected chunks add up to unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # input starting with the signature
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # the same input without the signature
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )
744
class EscapeDecodeTest(unittest.TestCase):
    # codecs.escape_decode() produces a bytes object, so the expected value
    # has to be bytes too: under Python 3 a comparison against ("", 0) can
    # never succeed because b"" != "".
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        # A str argument is accepted as well and still yields bytes.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000748
class RecodingTest(unittest.TestCase):
    # Regression test: writing through an EncodedFile whose data encoding is
    # "unicode_internal" and then closing it must not bring the interpreter
    # down.
    def test_recoding(self):
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw,
                                     data_encoding="unicode_internal",
                                     file_encoding="utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000757
# Sample strings from RFC 3492.
# Each entry is a 2-tuple: (text as str, its expected Punycode form as bytes).
# The <name> placeholders in the comments stand for the non-ASCII characters
# of the sample.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese (the description continues on the next comment line):
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print every entry that is not
# a 2-tuple (a missing comma between adjacent literals would silently merge
# them and change the tuple length).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000865
class PunycodeTest(unittest.TestCase):
    # Runs every (text, punycode) pair of punycode_testcases through the
    # "punycode" codec in both directions.
    def test_encode(self):
        for text, reference in punycode_testcases:
            # Compare case-insensitively: some of the reference encodings
            # contain upper case while our encoder emits lower case only.
            # Lower-casing just the reference would not be enough, because
            # some of the input characters are upper case themselves.
            produced = text.encode("punycode").decode("ascii").lower()
            wanted = reference.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Repeat with a bytes object rebuilt through an ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000884
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec. The first three tests do
    # nothing unless sys.maxunicode is above 0xffff.
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode <= 0xffff:
            return
        # The byte strings below are spelled big-endian and reversed on
        # little-endian machines.
        little_endian = sys.byteorder == "little"
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        invalid = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in valid:
            if little_endian:
                raw = raw[::-1]
            self.assertEqual(expected, raw.decode("unicode_internal"))
        for raw in invalid:
            if little_endian:
                raw = raw[::-1]
            self.assertRaises(UnicodeDecodeError, raw.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must describe the failing input.
        if sys.maxunicode <= 0xffff:
            return
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            payload.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(payload, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    def test_decode_callback(self):
        # With an "ignore" style error handler registered under a custom
        # name, the four junk bytes in the middle are dropped.
        if sys.maxunicode <= 0xffff:
            return
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        corrupted = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                          "ascii")
        ignored = decode(corrupted, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # Checks the second item of the tuple returned by the encoders.
        encode = codecs.getencoder("unicode_internal")
        self.assertEqual(encode("a")[1], 1)
        self.assertEqual(encode("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000941
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both given as UTF-8 encoded bytes:
#   * expected is None  -> nameprep must reject the input;
#   * (None, None)      -> vector deliberately skipped; the placeholder keeps
#                          the list position in step with the "3.N" numbering
#                          used in the comments below.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1094
1095
class NameprepTest(unittest.TestCase):
    # Drives encodings.idna.nameprep() with the vectors in nameprep_tests.
    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = raw.decode("utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the vector number ("3.N") in the message.
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001114
class IDNACodecTest(unittest.TestCase):
    # Tests for the "idna" codec: one-shot, stream and incremental use.
    #
    # The incremental tests cover the same four names as the one-shot tests:
    # plain ASCII and internationalised, each with and without a trailing
    # dot. Earlier the internationalised name with a trailing dot was
    # asserted twice and the variant without the dot was never checked.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            # Feed the decoder one byte at a time.
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot terminating it has been seen;
        # the last label without a dot is flushed by the final call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            # Iterating a str hands the encoder one character at a time.
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        # Same buffering rule as for decoding: output appears label by label.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001190
class CodecsModuleTest(unittest.TestCase):
    # Argument handling and error reporting of the module-level helpers in
    # codecs.

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument is optional.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument is optional.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001232
class StreamReaderTest(unittest.TestCase):
    # readlines() on a UTF-8 stream reader must split on the newline and
    # keep it at the end of the first line.

    def setUp(self):
        # Two multi-byte characters separated by a newline, UTF-8 encoded.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001242
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile() recodes between the encoding seen by the caller
    # (data_encoding) and the one used in the wrapped file (file_encoding).

    def test_basic(self):
        # Reading: UTF-8 in the file comes out as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(source,
                                             data_encoding='utf-16-le',
                                             file_encoding='utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 handed in ends up as Latin-1 in the file.
        sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(sink,
                                             data_encoding='utf-8',
                                             file_encoding='latin-1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001254
# Names of the text codecs the generic tests iterate over, in the order in
# which they are visited.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# The incremental coders are skipped for the same codecs and for "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001372
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Checks applied uniformly to every codec in all_unicode_encodings:
    # one-shot, stream and incremental round trips, seek() handling,
    # argument validation and coder state handling.

    def _check_stream_roundtrip(self, encoding, s):
        # Write s through a StreamWriter one character at a time, then feed
        # the encoded bytes to a StreamReader one byte at a time and check
        # that s comes back.
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        chunks = []
        for c in s:
            writer.write(c)
            chunk = q.read()
            self.assertIs(type(chunk), bytes,
                          "%r (encoding=%r)" % (type(chunk), encoding))
            chunks.append(chunk)
        encodedresult = b"".join(chunks)
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def _check_incremental_roundtrip(self, encoding, s, encoder, make_decoder,
                                     final):
        # Encode s character by character with encoder, then decode the
        # result byte by byte with the decoder returned by make_decoder().
        # The decoder is only created once encoding is complete.  When final
        # is true both coders are flushed with a last final=True call.
        encodedresult = b"".join(encoder.encode(c) for c in s)
        if final:
            encodedresult += encoder.encode("", True)
        decoder = make_decoder()
        decodedresult = "".join(decoder.decode(bytes([c]))
                                for c in encodedresult)
        if final:
            decodedresult += decoder.decode(b"", True)
        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def _check_incremental_coders(self, encoding, s):
        # Check the incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode().
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check incremental decoder/encoder
            self._check_incremental_roundtrip(
                encoding, s, encoder,
                lambda: codecs.getincrementaldecoder(encoding)(),
                True)

            # check C API
            self._check_incremental_roundtrip(
                encoding, s, cencoder,
                lambda: _testcapi.codec_incrementaldecoder(encoding),
                True)

            # check iterencode()/iterdecode()
            result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
            self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

            # check iterencode()/iterdecode() with empty string
            result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
            self.assertEqual(result, "", "%r != %r (encoding=%r)" % (result, "", encoding))

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument;
            # these coders are not flushed with a final call
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                self._check_incremental_roundtrip(
                    encoding, s, encoder,
                    lambda: codecs.getincrementaldecoder(encoding)("ignore"),
                    False)
                self._check_incremental_roundtrip(
                    encoding, s, cencoder,
                    lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
                    False)

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the requested name,
            # ignoring the difference between "_" and "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # one-shot encoder/decoder round trip
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_coders(encoding, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % (encoding,))

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are excluded from the int-argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                encoded = u.encode(encoding)
                self.check_state_handling_decode(encoding, u, encoded)
                self.check_state_handling_encode(encoding, u, encoded)
1505
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() with a str used as the decoding table.

    def test_decode_with_string_map(self):
        data = b"\x00\x01\x02"
        # (error handler, decoding table, expected text).  In all but the
        # first case byte 0x02 is either past the end of the table or mapped
        # to U+FFFE, and both are expected to be handled as a decoding error.
        cases = (
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, table, expected in cases:
            decoded = codecs.charmap_decode(data, errors, table)
            self.assertEqual(decoded, (expected, 3))

        # With an empty table every byte is dropped but still counted.
        allbytes = bytes(range(256))
        decoded = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(decoded, ("", len(allbytes)))
1538
class WithStmtTest(unittest.TestCase):
    # The codecs file wrappers must be usable in a with statement.

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            recoded = recoder.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001551
class TypesTest(unittest.TestCase):
    # Argument types accepted by the low-level decode functions.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        prefixes = [
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        ]
        if hasattr(codecs, "mbcs_decode"):
            prefixes.append("mbcs")
        for prefix in prefixes:
            decoder = getattr(codecs, prefix + "_decode")
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        decoders = (codecs.unicode_escape_decode,
                    codecs.raw_unicode_escape_decode)
        for decode in decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001581
class SurrogateEscapeTest(unittest.TestCase):
    # The "surrogateescape" error handler: undecodable bytes decode to lone
    # surrogates and those surrogates encode back to the original bytes.

    def _check_roundtrip(self, encoding, raw, text):
        # raw must decode to text, and text must encode back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1614
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001615
class BomTest(unittest.TestCase):
    # BOM handling of the UTF-16/UTF-32 codecs when a file opened with
    # codecs.open() is written to before and after seek() calls.

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                # Read the content twice, rewinding before each read.
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinner3fed0872010-05-22 02:16:27 +00001672
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The compression
# codecs are only added when their backing module can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1691
class TransformCodecTest(unittest.TestCase):
    # Round trips through the codecs named in bytes_transform_encodings.

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)

    def test_read(self):
        # A StreamReader must give back the original byte via read().
        for encoding in bytes_transform_encodings:
            sin = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(sin)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # Same as test_read but via readline(); uu_codec and zlib_codec
        # are not checked here.
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                sin = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(sin)
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.readline(), b"\x80")
1719
1720
def test_main():
    """Run all test case classes of this module via support.run_unittest()."""
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()