blob: 17038cb01b5a2dc3d4aab38b2ab971a8358131ad [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import unittest
Victor Stinner05010702011-05-27 16:50:40 +02003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner05010702011-05-27 16:50:40 +02005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer passed to the constructor determines the sequence type that
    is stored (the tests use ``b""``); everything written is appended to it
    and reads consume from the front.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) or ``None`` drains the whole queue.
        An empty sequence of the buffer's own type is returned when nothing
        is buffered.
        """
        if size is None or size < 0:
            s = self._buffer
            # Slice instead of using a literal so the empty value keeps the
            # same type as the buffer.
            self._buffer = self._buffer[:0]
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
26
class MixInCheckStateHandling:
    """Helpers that verify getstate()/setstate() of incremental codecs.

    The methods rely on the ``assert*`` methods of ``unittest.TestCase``,
    so the class is meant to be mixed into a test case.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted and resumed anywhere.

        For every split point ``i`` the first ``i`` bytes of *s* are decoded,
        the decoder state is transplanted into a fresh decoder, and the rest
        is decoded there; the concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted and resumed anywhere.

        Mirror image of check_state_handling_decode(): the encoder state
        taken after the first ``i`` characters is loaded into a fresh
        encoder which finishes the job; the joined output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader tests shared by the concrete codec test classes.

    Every method reads ``self.encoding``; this class does not define it, so
    it is only usable through subclasses that set that class attribute.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Returns the lines joined with "|" so that the position of
            # every line break is visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line terminators are spelled out as a tuple: str.split()
        # without arguments treats every one of them as whitespace and
        # would return an empty list, turning the loops below into no-ops.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; readalllines() stops at
            # the first empty result.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx\n" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over the reader must yield exactly the lines that were
        # written, line ends included.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes visible to the
        # reader piece by piece, as it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the BOM-aware "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
366
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once its
        # fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
406
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once its
        # fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
446
447
class UTF16Test(ReadTest):
    """Tests for the BOM-aware "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # NOTE(review): the 'U' mode flag is kept because it is what this
        # regression test was written against; newer Python releases have
        # deprecated it -- confirm before porting this test forward.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000523
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once its
        # second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
551
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once its
        # second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
579
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once its
        # last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions; the other
        # error handlers each substitute their own replacement.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000630
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per byte of the encoded input.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000645
class UTF16ExTest(unittest.TestCase):
    """Error-path tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte is not decodable as final input in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
653
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
669
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means "no size argument") and check that the
        # collected text always equals unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not reach the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
749
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected value has to be
        # bytes as well: in Python 3, b"" never compares equal to "".
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000753
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000754class RecodingTest(unittest.TestCase):
755 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000756 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200757 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000758 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000759 f2.close()
760 # Python used to crash on this at exit because of a refcount
761 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000762
# From RFC 3492
#
# Each entry is a (text, punycode) pair: a str sample and the bytes it
# corresponds to under the "punycode" codec.  Comparison on the encode
# side is done case-insensitively by the test that consumes this table.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any table entry that is not a 2-tuple
# (for example because a comma was dropped between adjacent literals).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000870
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = text.encode("punycode")
            self.assertEqual(
                str(produced, "ascii").lower(),
                str(expected, "ascii").lower()
            )

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from a copy of the bytes rebuilt via an ASCII str.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000889
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000890class UnicodeInternalTest(unittest.TestCase):
891 def test_bug1251300(self):
892 # Decoding with unicode_internal used to not correctly handle "code
893 # points" above 0x10ffff on UCS-4 builds.
894 if sys.maxunicode > 0xffff:
895 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000896 (b"\x00\x10\xff\xff", "\U0010ffff"),
897 (b"\x00\x00\x01\x01", "\U00000101"),
898 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000899 ]
900 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000901 b"\x7f\xff\xff\xff",
902 b"\x80\x00\x00\x00",
903 b"\x81\x00\x00\x00",
904 b"\x00",
905 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000906 ]
907 for internal, uni in ok:
908 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000909 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000910 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000911 for internal in not_ok:
912 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000913 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000914 self.assertRaises(UnicodeDecodeError, internal.decode,
915 "unicode_internal")
916
917 def test_decode_error_attributes(self):
918 if sys.maxunicode > 0xffff:
919 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000920 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000921 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000922 self.assertEqual("unicode_internal", ex.encoding)
923 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
924 self.assertEqual(4, ex.start)
925 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000926 else:
927 self.fail()
928
929 def test_decode_callback(self):
930 if sys.maxunicode > 0xffff:
931 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
932 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000933 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000934 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
935 "ascii"),
936 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000937 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000938
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000939 def test_encode_length(self):
940 # Issue 3739
941 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000942 self.assertEqual(encoder("a")[1], 1)
943 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000944
Ezio Melottib3aedd42010-11-20 19:04:17 +0000945 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000946
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (orig, prepped) pair of UTF-8 encoded bytes, listed in
# test-vector order (entry N is vector "3.N"):
#   * prepped is None  -> nameprep(orig) is expected to raise UnicodeError;
#   * (None, None)     -> the vector is skipped, but keeps its position so
#                         the numbering of the following entries is intact.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1099
1100
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests table."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected_text = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected_text)
            except Exception as e:
                # Re-raise with the 1-based vector number in the message.
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001119
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After reading everything, a further read() must return "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Same four names as test_builtin_decode (plain/IDN, with and
        # without a trailing dot), fed to the decoder one byte at a time.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            chunks = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(chunks, "idna")),
                expected
            )

        # A label is only emitted once its terminating dot has been seen;
        # the last label is flushed by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode; iterencode() walks the
        # str one character at a time.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # As for decoding: output appears label by label, the trailing
        # label only on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001195
class CodecsModuleTest(unittest.TestCase):
    """Exercise the module-level functions of ``codecs``."""

    def test_decode(self):
        latin1_bytes = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(latin1_bytes, 'latin-1'),
                         '\xe4\xf6\xfc')
        # The object to decode is mandatory.
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument may be omitted.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        latin1_text = '\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(latin1_text, 'latin-1'),
                         b'\xe4\xf6\xfc')
        # The object to encode is mandatory.
        with self.assertRaises(TypeError):
            codecs.encode()
        # Unknown codec names are reported as LookupError.
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument may be omitted.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bad_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bad_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE setting however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1250
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001260
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile() recodes between a data encoding (what the caller
    # reads/writes) and a file encoding (what the wrapped stream holds).

    def test_basic(self):
        # Reading: the stream holds UTF-8, the caller receives UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        recoded = recoder.read()
        self.assertEqual(recoded, b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the stream receives Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001272
# Text encodings exercised by the generic codec tests, in alphabetical order.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    # EBCDIC / DOS / Windows code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# Deliberately not tested, because it is not supposed to work:
#    "undefined"

# Encodings that don't work in stateful (stream) mode.
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001390
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks run against every entry of all_unicode_encodings.

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical codec name must match the listed name, ignoring
            # the difference between "_" and "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip; the reported size is compared with len(s).
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and drain the queue after
                # each write; every chunk must be a bytes object.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # An empty final call (final=True) flushes the encoder.
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Python-level incremental coders with errors="ignore"
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        # Same check through the C API
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Rewind and re-read several times; each pass must return the
            # complete text again.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # Calling without any argument must be rejected.
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the integer-argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            # Calling without any argument must be rejected.
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # The comparison is deliberately trivial; merely touching the type
        # is the point of the test.
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # Both helpers are expected to come from
                # MixInCheckStateHandling.
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1523
class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        # charmap_decode() with a str mapping.  The expectations below treat
        # a byte with no entry in the mapping, or one mapped to U+FFFE, as
        # undecodable: "replace" yields U+FFFD and "ignore" drops it.
        three_bytes = b"\x00\x01\x02"
        # (error handler, mapping, expected text)
        scenarios = [
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ]
        for errors, mapping, expected_text in scenarios:
            decoded = codecs.charmap_decode(three_bytes, errors, mapping)
            # All three input bytes are reported as consumed in every case.
            self.assertEqual(decoded, (expected_text, 3))

        # With an empty mapping and "ignore", every possible byte is dropped
        # but still counted as consumed.
        every_byte = bytes(range(256))
        decoded = codecs.charmap_decode(every_byte, "ignore", "")
        self.assertEqual(decoded, ("", len(every_byte)))
1556
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable as context managers.

    def test_encodedfile(self):
        utf8_stream = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(utf8_stream, "latin-1", "utf-8")
        with recoder as ef:
            recoded = ef.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        utf8_stream = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            utf8_stream, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001569
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders must reject str input with a TypeError.
        bytes_only_decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is included only when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only_decoders += [codecs.mbcs_decode]
        for decode in bytes_only_decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001599
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" handler the tests expect an undecodable
    # byte 0xNN to decode to the lone surrogate U+DCNN, and that surrogate
    # to encode back to the original byte.

    def _check_round_trip(self, encoding, raw, text):
        # Decode raw -> text, then encode text -> raw, both with
        # errors="surrogateescape".
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_round_trip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_round_trip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_round_trip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_round_trip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1632
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001633
class BomTest(unittest.TestCase):
    def test_seek0(self):
        payload = "1234567890"
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        # Remove the scratch file once the test is over.
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file afresh in 'w+' mode for this encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                # Read everything back twice to make sure rewinding works
                # repeatedly.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), payload * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(payload[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(payload)
                writer.seek(writer.tell())
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001689
Victor Stinner3fed0872010-05-22 02:16:27 +00001690
# Bytes-to-bytes transform codecs that are always tested.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# The compression codecs are tested only when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1709
class TransformCodecTest(unittest.TestCase):
    # Checks for the codecs listed in bytes_transform_encodings.

    def test_basics(self):
        every_byte = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # Round trip through the generic stateless codecs interface.
            encoded, consumed = codecs.getencoder(encoding)(every_byte)
            self.assertEqual(consumed, len(every_byte))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, every_byte)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream_reader = codecs.getreader(encoding)(io.BytesIO(encoded))
            self.assertEqual(stream_reader.read(), b"\x80")

    def test_readline(self):
        # uu_codec and zlib_codec are excluded from the readline() check.
        excluded = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding not in excluded:
                encoded = codecs.encode(b"\x80", encoding)
                stream_reader = codecs.getreader(encoding)(io.BytesIO(encoded))
                self.assertEqual(stream_reader.readline(), b"\x80")
1737
1738
def test_main():
    # Hand every test case class of this module, in order, to
    # support.run_unittest().
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test, UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001768
1769
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()