blob: eab07c9887558da7394cb47a769625adeae63f79 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major OS version of 6 or higher;
# sys.getwindowsversion() is evaluated only on win32 thanks to the
# short-circuiting ``and``.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
16
# ctypes is optional: when it cannot be imported the name is bound to None
# and SIZEOF_WCHAR_T falls back to the sentinel value -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    The returned function asserts (through ``self.assertEqual``) that
    ``coder(input)`` yields the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        wanted = (expect, len(input))
        self.assertEqual(coder(input), wanted)

    return check
29
class Queue:
    """
    FIFO buffer: data written at one end is read back from the other end.

    The buffer keeps the type of the object passed to the constructor
    (the tests in this file use ``b""``).
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* drains the whole buffer.
        """
        pending = self._buffer
        # A negative size means "everything": cut at the end of the buffer.
        cut = len(pending) if size < 0 else size
        self._buffer = pending[cut:]
        return pending[:cut]
49
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() of the
    incremental codecs.

    The helpers call ``self.assert*`` methods, so the class they are mixed
    into must provide the unittest.TestCase assertion API.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted at any byte offset.

        For every split point ``i`` the first ``i`` bytes are decoded, the
        decoder state is extracted and transplanted into a fresh decoder,
        and the remaining bytes are decoded with ``final=True``.  The two
        parts together must equal *u*.
        """
        for i in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            part1 = decoder.decode(s[:i])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            part2 = decoder.decode(s[i:], True)
            self.assertEqual(u, part1 + part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted at any character offset.

        For every split point ``i`` the first ``i`` characters are encoded,
        the encoder state is moved into a fresh encoder, and the rest is
        encoded with ``final=True``.  The two parts together must equal *s*.
        """
        for i in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1 + part2)
82
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    The methods read ``self.encoding`` throughout, and test_lone_surrogates
    additionally reads ``self.ill_formed_sequence``; both are expected to be
    supplied by the concrete subclasses.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare the
        accumulated output after each byte with the matching entry of
        *partialresults*.

        The check is done with a StreamReader, with an incremental decoder,
        with the same incremental decoder after reset(), and finally with
        codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # StreamReader reading from a queue that is filled byte by byte.
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for byte, expected in zip(encoded, partialresults):
            queue.write(bytes([byte]))
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # Incremental decoder: first freshly created, then again after
        # reset() to check that the reset method works properly.
        decoder = codecs.getincrementaldecoder(self.encoding)()
        for after_reset in (False, True):
            if after_reset:
                decoder.reset()
            decoded = ""
            for byte, expected in zip(encoded, partialresults):
                decoded += decoder.decode(bytes([byte]))
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        single_bytes = [bytes([byte]) for byte in encoded]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding))
        )

    def test_readline(self):
        """Exercise StreamReader.readline() with every supported line end."""
        def getreader(text):
            stream = io.BytesIO(text.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(text, keepends=True, size=None):
            # Read lines until readline() returns the empty string and
            # join them with "|" so the split points become visible.
            reader = getreader(text)
            read_one = lambda: reader.readline(size=size, keepends=keepends)
            return "|".join(iter(read_one, ""))

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        bodies = [(i*200+200)*"\u3042" for i in range(len(lineends))]
        with_ends = [body + lineend
                     for body, lineend in zip(bodies, lineends)]
        self.assertEqual(readalllines("".join(with_ends), True),
                         "|".join(with_ends))
        self.assertEqual(readalllines("".join(with_ends), False),
                         "|".join(bodies))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                cases = (
                    (True, size*"a" + lineend, "xxx\n"),
                    (False, size*"a", "xxx"),
                )
                for keepends, first, second in cases:
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        """Mix readline(), read() and readlines() calls on one reader."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def new_reader():
            raw = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        # Issue #8260: Test readline() followed by read()
        reader = new_reader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.read(), ''.join(lines[1:]))
        self.assertEqual(reader.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        reader = new_reader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.readlines(), lines[1:])
        self.assertEqual(reader.read(), '')

        # Test read() followed by read()
        reader = new_reader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #12446: Test read() followed by readlines()
        reader = new_reader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        """Iterating a reader must give back each source line unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        payload = "".join(s).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        for index, line in enumerate(reader):
            self.assertEqual(line, s[index])

    def test_readlinequeue(self):
        """readline() on a reader whose input arrives piece by piece."""
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        # Each step writes one chunk and then checks the listed sequence
        # of readline() results.
        scenarios = (
            # No lineends
            (False, (
                ("foo\r", ("foo",)),
                ("\nbar\r", ("", "bar")),
                ("baz", ("baz", "")),
            )),
            # Lineends
            (True, (
                ("foo\r", ("foo\r",)),
                ("\nbar\r", ("\n", "bar\r")),
                ("baz", ("baz", "")),
                ("foo\r\n", ("foo\r\n",)),
            )),
        )
        for keepends, steps in scenarios:
            for chunk, expected_lines in steps:
                writer.write(chunk)
                for expected in expected_lines:
                    self.assertEqual(reader.readline(keepends=keepends),
                                     expected)

    def test_bug1098990_a(self):
        """readline() must return three \\r\\n-terminated lines intact."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        encoded = (s1+s2+s3).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # The trailing "" checks that the reader is exhausted afterwards.
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        """readline() must return five \\r\\n-terminated lines intact."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        encoded = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # The trailing "" checks that the reader is exhausted afterwards.
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)

    # Text expected from the "replace" handler when decoding
    # ill_formed_sequence in test_lone_surrogates.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        """Encode and decode a lone surrogate with each error handler."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        encode_cases = (
            ("backslashreplace", "[\\udc80]"),
            ("namereplace", "[\\udc80]"),
            ("xmlcharrefreplace", "[&#56448;]"),
            ("ignore", "[]"),
            ("replace", "[?]"),
        )
        for handler, replacement in encode_cases:
            self.assertEqual("[\uDC80]".encode(self.encoding, handler),
                             replacement.encode(self.encoding))

        # Encoding the empty string yields whatever prefix (e.g. a BOM)
        # the codec emits; strip it from the individual pieces below.
        bom = "".encode(self.encoding)
        skip = len(bom)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[skip:]
            after_sequence = after.encode(self.encoding)[skip:]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            escaped = ''.join('\\x%02x' % b
                              for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + escaped + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200385
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"
    # A lone surrogate (U+DC80) in the platform's native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the actual bytes on failure, unlike assertTrue on a boolean.
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that starts with an invalid BOM must raise UnicodeError."""
        for repeat in (4, 8):
            s = io.BytesIO(repeat*b"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: 4 BOM bytes, then 4 bytes per character."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated final input is handled by 'replace' and 'ignore'."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final input must raise under the 'strict' handler."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
480
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # A lone surrogate (U+DC80) encoded as little-endian UTF-32.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears with its 4th byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3                            # first character incomplete
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character encodes to its four little-endian bytes."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated final input must raise under the 'strict' handler."""
        decode = codecs.utf_32_le_decode
        self.assertRaises(UnicodeDecodeError, decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        data = b'\x00\x00\x01\x00' * count
        decoded = codecs.utf_32_le_decode(data)[0]
        self.assertEqual('\U00010000' * count, decoded)
525
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # A lone surrogate (U+DC80) encoded as big-endian UTF-32.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears with its 4th byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3                            # first character incomplete
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character encodes to its four big-endian bytes."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated final input must raise under the 'strict' handler."""
        decode = codecs.utf_32_be_decode
        self.assertRaises(UnicodeDecodeError, decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        data = b'\x00\x01\x00\x00' * count
        decoded = codecs.utf_32_be_decode(data)[0]
        self.assertEqual('\U00010000' * count, decoded)
570
571
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-16" codec."""

    encoding = "utf-16"
    # A lone surrogate (U+DC80) in the platform's native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the actual bytes on failure, unlike assertTrue on a boolean.
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that starts with an invalid BOM must raise UnicodeError."""
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: 2 BOM bytes, then 2 or 4 bytes per
        character."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated final input is handled by 'replace' and 'ignore'."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final input must raise under the 'strict' handler."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # NOTE(review): the DeprecationWarning filtered here is presumably
        # the one triggered by the 'U' mode flag -- confirm.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000657
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # A lone surrogate (U+DC80) encoded as little-endian UTF-16.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        """Byte-by-byte decoding: BMP characters appear with their 2nd
        byte, the non-BMP character with its 4th."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""]                                # first character incomplete
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4      # 3 bytes of the pair pending
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises under 'strict' and is replaced by U+FFFD
        under 'replace'."""
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for malformed, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
701
class UTF16BETest(ReadTest, unittest.TestCase):
    """Run the shared ReadTest checks against the "utf-16-be" codec."""

    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        """Check incremental decoding via the inherited check_partial().

        check_partial() is defined outside this class; the list presumably
        holds the text decoded so far after each additional input byte.
        """
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD otherwise."""
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports which input failed and lets the remaining
            # inputs run instead of stopping at the first failure.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
745
class UTF8Test(ReadTest, unittest.TestCase):
    """Shared ReadTest checks plus UTF-8 specific surrogate handling."""

    encoding = "utf-8"
    # The three bytes a UTF-8 encoder would produce for the lone
    # surrogate U+DC80 if surrogates were allowed.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        """Check incremental decoding via the inherited check_partial().

        check_partial() is defined outside this class; the list presumably
        holds the text decoded so far after each additional input byte.
        """
        expected_prefixes = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial("\x00\xff\u07ff\u0800\uffff\U00010000",
                           expected_prefixes)

    def test_decoder_state(self):
        """Run the inherited decoder state check over 1- to 4-byte characters."""
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        """Extend the inherited test with a surrogateescape encoding check."""
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        """The surrogatepass handler round-trips lone surrogates."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # An incomplete surrogate sequence must still be rejected.
        for incomplete in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                incomplete.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000799
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows code page 65001 codec."""

    encoding = "cp65001"

    def test_encode(self):
        """Encode sample strings; an expected value of None means "raises"."""
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))

        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode sample bytes; an expected value of None means "raises"."""
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))

        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler is given."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """The surrogatepass handler round-trips lone surrogates."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
901
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200902
class UTF7Test(ReadTest, unittest.TestCase):
    """Run the shared ReadTest checks against the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        """Check incremental decoding via the inherited check_partial().

        check_partial() is defined outside this class; the list presumably
        holds the text decoded so far after each additional input byte.
        """
        expected_prefixes = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial('a+-b\x00c\x80d\u0100e\U00010000f',
                           expected_prefixes)

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD otherwise."""
        malformed = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, replaced in malformed:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        """A non-BMP character and its surrogate pair encode identically."""
        encoded = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Override the inherited test with None so it is not collected here.
    test_lone_surrogates = None
976
977
class UTF16ExTest(unittest.TestCase):
    """Error and argument handling of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A lone byte passed as final input must be rejected in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
985
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array.array of bytes is returned as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        """An empty str yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1001
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """UTF8Test checks re-run for "utf-8-sig", plus BOM handling tests."""

    encoding = "utf-8-sig"

    def test_partial(self):
        """Check incremental decoding via the inherited check_partial().

        check_partial() is defined outside this class; the list presumably
        holds the text decoded so far after each additional input byte.
        """
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder drops the BOM written by the encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader in chunks.

        The stream is drained once per size hint (None means read() is
        called without an argument) and the concatenated output must equal
        *unistring* every time.
        """
        reader = codecs.getreader("utf-8-sig")
        size_hints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in size_hints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Input that starts with a BOM decodes to the text without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        """Input without a BOM decodes to the same text."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1085
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty bytes and an empty bytearray both decode to empty bytes."""
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        """Every byte except a backslash passes through unchanged."""
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            payload = byte + b'0'
            self.assertEqual(codecs.escape_decode(payload), (payload, 2))

    def test_escape(self):
        """Recognised escapes are translated; unknown ones are kept as is."""
        check = coding_checker(self, codecs.escape_decode)
        cases = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, expected in cases:
            check(escaped, expected)
        # Bytes that start a recognised escape sequence after a backslash.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            unknown = b'\\' + bytes([code])
            check(unknown, unknown)

    def test_errors(self):
        """A truncated \\x escape raises, or is dropped/replaced on request."""
        decode = codecs.escape_decode
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            doubled = bracketed + truncated
            self.assertRaises(ValueError, decode, truncated)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001138
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        raw_stream = io.BytesIO()
        recoder = codecs.EncodedFile(raw_stream, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also close the wrapped stream.
        self.assertTrue(raw_stream.closed)
1149
# Sample strings from RFC 3492, as (unicode text, punycode bytes) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple, which
# would point at a missing comma in the table above.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257
class PunycodeTest(unittest.TestCase):
    """Check the punycode codec against the punycode_testcases table."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(text.encode("punycode"), "ascii").lower()
            wanted = str(expected, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII
            # round-trip of the sample.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001276
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    # Warning filter handed to support.check_warnings() wherever a test
    # requires the codec's deprecation warning to be emitted.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on little-endian
        # machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(self._DEPRECATED):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the failing 4-byte unit."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports "UnicodeDecodeError not raised" when decoding
        # unexpectedly succeeds, rather than a bare failure with no message.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(self._DEPRECATED):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error callback is used for an invalid code unit."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four invalid bytes between the encoded "a" and "b".
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items they consumed."""
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001357
# Nameprep test vectors, from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks input containing prohibited characters;
# (None, None) marks a vector that is skipped.  Entries are positional:
# the entry at index i is reported as "Test 3.<i+1>", so skipped vectors
# must keep their placeholder.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1510
1511
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors against encodings.idna.nameprep()."""

    def test_nameprep(self):
        """Check every vector in the module-level nameprep_tests table.

        Entries whose input is None are skipped.  An expected value of
        None means nameprep() must raise UnicodeError; otherwise its
        result must equal the expected string.  Each vector runs in its
        own subTest labelled with its "3.<n>" number, so a failure names
        the vector, keeps the original assertion/traceback and does not
        stop the remaining vectors from running.
        """
        from encodings.idna import nameprep
        # The vectors are numbered from 1 ("3.1", "3.2", ...).
        for number, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % number):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001530
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: str/bytes API, stream reader and the
    incremental encoder/decoder."""

    def test_builtin_decode(self):
        """str(bytes, "idna") handles plain and ACE names, with and
        without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") handles plain and non-ASCII names, with and
        without a trailing dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading again after the stream is exhausted returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte-by-byte via iterdecode() and via an explicit
        incremental decoder (including after reset())."""
        # Same four names as test_builtin_decode: plain / ACE, each with
        # and without a trailing dot.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                single_bytes = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(single_bytes, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is held back until final.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode character-by-character via iterencode() and via an
        explicit incremental encoder (including after reset())."""
        # Same four names as test_builtin_encode.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # Without a trailing dot the last label is held back until final.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1616
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001617class CodecsModuleTest(unittest.TestCase):
1618
1619 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001620 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1621 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001622 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001623 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001624 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001625
Victor Stinnera57dfd02014-05-14 17:13:14 +02001626 # test keywords
1627 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1628 '\xe4\xf6\xfc')
1629 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1630 '[]')
1631
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001632 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001633 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1634 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001635 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001636 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001637 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001638 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001639
Victor Stinnera57dfd02014-05-14 17:13:14 +02001640 # test keywords
1641 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1642 b'\xe4\xf6\xfc')
1643 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1644 b'[]')
1645
Walter Dörwald063e1e82004-10-28 13:04:26 +00001646 def test_register(self):
1647 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001648 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001649
1650 def test_lookup(self):
1651 self.assertRaises(TypeError, codecs.lookup)
1652 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001653 self.assertRaises(LookupError, codecs.lookup, " ")
1654
1655 def test_getencoder(self):
1656 self.assertRaises(TypeError, codecs.getencoder)
1657 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1658
1659 def test_getdecoder(self):
1660 self.assertRaises(TypeError, codecs.getdecoder)
1661 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1662
1663 def test_getreader(self):
1664 self.assertRaises(TypeError, codecs.getreader)
1665 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1666
1667 def test_getwriter(self):
1668 self.assertRaises(TypeError, codecs.getwriter)
1669 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001670
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001671 def test_lookup_issue1813(self):
1672 # Issue #1813: under Turkish locales, lookup of some codecs failed
1673 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001674 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001675 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1676 try:
1677 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1678 except locale.Error:
1679 # Unsupported locale on this system
1680 self.skipTest('test needs Turkish locale')
1681 c = codecs.lookup('ASCII')
1682 self.assertEqual(c.name, 'ascii')
1683
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001684 def test_all(self):
1685 api = (
1686 "encode", "decode",
1687 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1688 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1689 "getencoder", "getdecoder", "getincrementalencoder",
1690 "getincrementaldecoder", "getreader", "getwriter",
1691 "register_error", "lookup_error",
1692 "strict_errors", "replace_errors", "ignore_errors",
1693 "xmlcharrefreplace_errors", "backslashreplace_errors",
1694 "namereplace_errors",
1695 "open", "EncodedFile",
1696 "iterencode", "iterdecode",
1697 "BOM", "BOM_BE", "BOM_LE",
1698 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1699 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1700 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1701 "StreamReaderWriter", "StreamRecoder",
1702 )
1703 self.assertCountEqual(api, codecs.__all__)
1704 for api in codecs.__all__:
1705 getattr(codecs, api)
1706
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001707 def test_open(self):
1708 self.addCleanup(support.unlink, support.TESTFN)
1709 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1710 with self.subTest(mode), \
1711 codecs.open(support.TESTFN, mode, 'ascii') as file:
1712 self.assertIsInstance(file, codecs.StreamReaderWriter)
1713
1714 def test_undefined(self):
1715 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1716 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1717 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1718 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1719 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1720 self.assertRaises(UnicodeError,
1721 codecs.encode, 'abc', 'undefined', errors)
1722 self.assertRaises(UnicodeError,
1723 codecs.decode, b'abc', 'undefined', errors)
1724
class StreamReaderTest(unittest.TestCase):
    """Tests for a UTF-8 StreamReader over an in-memory byte stream."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits on the newline and keeps it on the first line."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001734
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding on read and on write."""

    def test_basic(self):
        """Read UTF-8 file content as UTF-16-LE, then write UTF-8 data
        into a Latin-1 file."""
        # Reading: the underlying file holds UTF-8, the caller sees UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001746
# Names of the text encodings exercised by the generic codec tests.
# "undefined" is deliberately left out: it is not supposed to work.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001864
Walter Dörwald3abcb012007-04-16 22:10:50 +00001865class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001866 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001867 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001868 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001869 name = codecs.lookup(encoding).name
1870 if encoding.endswith("_codec"):
1871 name += "_codec"
1872 elif encoding == "latin_1":
1873 name = "latin_1"
1874 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001875
Ezio Melottiadc417c2011-11-17 12:23:34 +02001876 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001877 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001878 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001879 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001880 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001881 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001882
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001883 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001884 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001885 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001886 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001887 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001888 for c in s:
1889 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001890 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001891 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001892 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001893 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001894 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001895 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001896 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001897 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001898 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001899 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001900
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001901 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001902 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001903 try:
1904 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001905 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001906 pass
1907 else:
1908 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001909 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001910 for c in s:
1911 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001912 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001913 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001914 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001915 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001916 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001917 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001918 self.assertEqual(decodedresult, s,
1919 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001920
1921 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001922 result = "".join(codecs.iterdecode(
1923 codecs.iterencode(s, encoding), encoding))
1924 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001925
1926 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001927 result = "".join(codecs.iterdecode(
1928 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001929 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001930
Victor Stinner554f3f02010-06-16 23:33:54 +00001931 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001932 # check incremental decoder/encoder with errors argument
1933 try:
1934 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001935 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001936 pass
1937 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001938 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001939 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001940 decodedresult = "".join(decoder.decode(bytes([c]))
1941 for c in encodedresult)
1942 self.assertEqual(decodedresult, s,
1943 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001944
@support.cpython_only
def test_basics_capi(self):
    # Round-trip a plain ASCII string through the incremental encoder and
    # decoder objects returned by the _testcapi helper functions, once with
    # the default error handler and once with errors="ignore".
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                # Feed the encoder one character at a time, then flush it.
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                # Feed the decoder one byte at a time, then flush it.
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001983
def test_seek(self):
    # Read the same encoded stream several times through a StreamReader,
    # rewinding before each read.
    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        if encoding == "idna":  # FIXME: See SF bug #1163178
            continue
        if encoding in broken_unicode_with_stateful:
            continue
        reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            data = reader.read()
            self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001998
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all but the
    # listed exceptions must also reject an int as input.
    for encoding in all_unicode_encodings:
        decoder = codecs.getdecoder(encoding)
        self.assertRaises(TypeError, decoder)
        if encoding not in ("idna", "punycode"):
            self.assertRaises(TypeError, decoder, 42)
2005
def test_bad_encode_args(self):
    # Every encoder must reject a call without arguments.
    for encoding in all_unicode_encodings:
        encoder = codecs.getencoder(encoding)
        with support.check_warnings():
            # unicode-internal has been deprecated
            self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002012
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002013 def test_encoding_map_type_initialized(self):
2014 from encodings import cp1140
2015 # This used to crash, we are only verifying there's no crash.
2016 table_type = type(cp1140.encoding_table)
2017 self.assertEqual(table_type, table_type)
2018
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
2026
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with the three kinds of mapping it is
    # given here: a str indexed by byte value, a dict of int -> str, and a
    # dict of int -> int (code point).  Each kind is checked with the
    # "strict", "replace", "backslashreplace" and "ignore" error handlers.

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 is beyond the end of the map.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # Byte 2 maps to U+FFFE, which is treated as "undefined".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # With an empty map every byte is undefined and gets ignored.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A byte may map to more than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to no character at all.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\\x02", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is a TypeError, not a decode error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: a, 1: b}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
2261
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002262
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers and close the
    # wrapped stream when the "with" block is left.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same post-condition as test_encodedfile: leaving the block closes
        # the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002276
class TypesTest(unittest.TestCase):
    # Checks which input types the low-level decoder functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only present on some platforms.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002316
Serhiy Storchakad6793772013-01-29 10:20:44 +02002317
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through as is.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character with
        # the same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by any other byte is kept literally.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # c is the escape letter; i is the number of digits supplied after it.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2394
2395
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() /
    # raw_unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Characters 0..255 are encoded as the byte with the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but 'u' / 'U' is left untouched.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # c is the escape letter; i is the number of digits supplied after it.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2444
2445
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" handler, an undecodable byte 0xNN becomes the
    # lone surrogate U+DCNN on decoding and is turned back into the original
    # byte on encoding.

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2478
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002479
class BomTest(unittest.TestCase):
    # Checks BOM handling of files opened with codecs.open() when writes are
    # combined with seek().

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # The same scratch file is reused for every encoding below.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002535
Victor Stinner3fed0872010-05-22 02:16:27 +00002536
# Names of the bytes-to-bytes transform codecs exercised by
# TransformCodecTest.  Entries for optional compression modules are appended
# below only when the module can be imported.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Codec name -> aliases that must resolve to that codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound: it is used as a skip condition further down.
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002566
class TransformCodecTest(unittest.TestCase):
    # Tests for the bytes-to-bytes (and str-to-str) transform codecs: that
    # they round-trip through the generic codecs interface, and that the
    # str.encode() / bytes.decode() convenience methods refuse to use them.

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Both pieces are raw strings: "\(" is a regex escape, not a
                # string escape.
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper has the same type as the exception it wraps.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2690
Nick Coghlan8b097b42013-11-13 23:49:21 +10002691
2692# The codec system tries to wrap exceptions in order to ensure the error
2693# mentions the operation being performed and the codec involved. We
2694# currently *only* want this to happen for relatively stateless
2695# exceptions, where the only significant information they contain is their
2696# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002697
2698# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    """Codec search function backed by the local _TEST_CODECS mapping.

    Returns the entry stored under *codec_name*, or None when there is no
    such entry.
    """
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)
2705
# Issue #22166: Also need to clear the internal cache in CPython
try:
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """No-op stand-in used when _codecs has no _forget_codec."""
2712
2713
class ExceptionChainingTest(unittest.TestCase):
    """Check when the codec machinery wraps exceptions raised by a codec.

    Each test installs a throwaway codec in the module-level _TEST_CODECS
    registry under a name unique to the test instance and then drives it
    through str.encode/bytes.decode and codecs.encode/codecs.decode.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # Nothing was cached under this name; that is fine.
            pass

    def set_codec(self, encode, decode):
        """Register *encode*/*decode* as the codec named self.codec_name."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Context manager asserting the body raises a wrapped *exc_type*.

        The raised exception must match
        "<operation> with '<codec>' codec failed (<type name>: <msg>)"
        (*msg* is inserted into the regex as-is) and must be chained, via
        __cause__, to an *exc_type* instance that carries a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        # caught.exception is only populated once the block above exits.
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Assert that raising *obj_to_raise* from the codec gets wrapped.

        All four entry points (str.encode, codecs.encode, bytes.decode and
        codecs.decode) are checked with assertWrapped().
        """
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Assert that *obj_to_raise* propagates from the codec unchanged.

        *msg* is a regex matched against the RuntimeError seen by the
        caller at each of the four encode/decode entry points.
        """
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is registered under self.codec_name in this test, so
        # every entry point fails at lookup time.
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # (every pattern fragment is a raw string: "\(" in a non-raw
        # literal is an invalid string escape sequence)
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2873
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002874
Georg Brandl02524622010-12-02 18:06:51 +00002875
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError, code page 123
        # with OSError.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page that failed.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decode cases against code page *cp*.

        An expected value of None means decoding must raise
        UnicodeDecodeError; otherwise the decoded text must equal it and
        the reported consumed length must lie within 0..len(raw).
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            mismatch = ('%a.decode("cp%s", %r)=%a != %a'
                        % (raw, cp, errors, decoded[0], expected))
            self.assertEqual(decoded[0], expected, mismatch)
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(decoded[1], 0)
            self.assertLessEqual(decoded[1], len(raw))

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encode cases against code page *cp*.

        An expected value of None means encoding must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal it and
        the reported consumed length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            mismatch = ('%a.encode("cp%s", %r)=%a != %a'
                        % (text, cp, errors, encoded[0], expected))
            self.assertEqual(encoded[0], expected, mismatch)
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        utf7_cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(utf7_cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(utf7_cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        # The encode cases only run when VISTA_OR_LATER is true.
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # The last argument is False here (True everywhere else in this
        # class); each case pairs an input with the expected
        # (text, consumed) result.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for chunk, expected in cases:
            decoded = codecs.code_page_decode(932, chunk, 'strict', False)
            self.assertEqual(decoded, expected)
3034
3035
# Run the tests via unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()