blob: 5ba2c7bdc5f8d3925779be72891b4f9d39459680 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# ctypes is an optional extension module.  When it cannot be imported the
# module-level name ``ctypes`` is bound to None and SIZEOF_WCHAR_T to -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000019
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` callable bound to *coder*.

    The returned function calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result equals ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))

    return check
24
Victor Stinnerf96418d2015-09-21 23:06:27 +020025
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """

    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type used for
        # "empty" (b"" for bytes, "" for str).
        self._buffer = buffer

    def write(self, chars):
        # Append *chars* to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to *size* items from the front of the
        # buffer; a negative *size* drains everything that is pending.
        pending = self._buffer
        if size < 0:
            taken, remaining = pending, pending[:0]  # pending[:0]: make empty
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
45
Victor Stinnerf96418d2015-09-21 23:06:27 +020046
class MixInCheckStateHandling:
    """Mix-in with helpers that check incremental codec getstate()/setstate().

    The helpers call unittest-style assertion methods (assertEqual,
    assertFalse, assertIsInstance) on ``self``, so the class they are mixed
    into has to provide them.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be suspended and resumed anywhere.

        For every split position ``i`` in *s*, ``s[:i]`` is decoded with one
        incremental decoder, its state is transplanted into a fresh decoder,
        and ``s[i:]`` is decoded (as final input) with the fresh one.  The
        concatenation of both outputs must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be suspended and resumed anywhere.

        For every split position ``i`` in *u*, ``u[:i]`` is encoded with one
        incremental encoder, its state is transplanted into a fresh encoder,
        and ``u[i:]`` is encoded (as final input) with the fresh one.  The
        concatenation of both outputs must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
79
Victor Stinnerf96418d2015-09-21 23:06:27 +020080
class ReadTest(MixInCheckStateHandling):
    # Generic reader/decoder checks shared by the per-encoding test classes.
    # This class defines neither ``encoding`` nor ``ill_formed_sequence``;
    # the methods read them from ``self``, so concrete subclasses must set
    # both (``ill_formed_sequence`` is the encoded form of the lone
    # surrogate U+DC80 used by test_lone_surrogates).

    def check_partial(self, input, partialresults):
        """Decode *input* byte by byte and compare the accumulated output.

        After the n-th byte has been fed, the text decoded so far must equal
        ``partialresults[n-1]``.  The check is run with a StreamReader, with
        an incremental decoder, with the same decoder after reset(), and
        finally with codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # StreamReader.readline() must split on \n, \r\n, \r and \u2028,
        # with and without keepends and with an explicit size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line and join them with "|" so that the line
            # boundaries are visible in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline(), read(), read(n) and readlines() on the same
        # reader must neither lose nor duplicate data.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were encoded into the stream.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Compare the complete list rather than line by line: a reader that
        # stops early (or yields extra lines) must fail the test instead of
        # passing silently (or dying with an IndexError).
        self.assertEqual(list(reader), s)

    def test_readlinequeue(self):
        # readline() on a stream that receives its data piecewise; the
        # writer and the reader share one Queue.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three \r\n-terminated lines must be returned unchanged by
        # successive readline() calls, followed by "" at end of stream.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Lone surrogates must be rejected by strict encoding/decoding and
        # handled as asserted below by each error handler.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever encoding the empty string produces (e.g. a BOM) is
        # stripped from the individually encoded pieces and put back once
        # at the front of the assembled byte sequence.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
412
Victor Stinnerf96418d2015-09-21 23:06:27 +0200413
class UTF32Test(ReadTest, unittest.TestCase):
    # UTF-32 with a byte order mark; runs the shared ReadTest checks and
    # adds BOM-specific ones.
    encoding = "utf-32"
    # The lone surrogate U+DC80 as one 32-bit code unit in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input made only of 0xff bytes must make the reader raise
        # UnicodeError.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final code unit is replaced or dropped according to
        # the error handler; the single input byte counts as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
508
Victor Stinnerf96418d2015-09-21 23:06:27 +0200509
class UTF32LETest(ReadTest, unittest.TestCase):
    # Little-endian UTF-32; runs the shared ReadTest checks.
    encoding = "utf-32-le"
    # The lone surrogate U+DC80 as one little-endian 32-bit code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # Every character takes four bytes, so the decoded prefix grows by
        # one character on every fourth byte fed to the decoder.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [""] * 3
        for count in range(1, len(text)):
            expected.extend([text[:count]] * 4)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        code_unit = b'\x00\x00\x01\x00'
        decoded, _consumed = codecs.utf_32_le_decode(code_unit * 1024)
        self.assertEqual('\U00010000' * 1024, decoded)
554
Victor Stinnerf96418d2015-09-21 23:06:27 +0200555
class UTF32BETest(ReadTest, unittest.TestCase):
    # Big-endian UTF-32; runs the shared ReadTest checks.
    encoding = "utf-32-be"
    # The lone surrogate U+DC80 as one big-endian 32-bit code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # Every character takes four bytes, so the decoded prefix grows by
        # one character on every fourth byte fed to the decoder.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [""] * 3
        for count in range(1, len(text)):
            expected.extend([text[:count]] * 4)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        code_unit = b'\x00\x01\x00\x00'
        decoded, _consumed = codecs.utf_32_be_decode(code_unit * 1024)
        self.assertEqual('\U00010000' * 1024, decoded)
600
601
class UTF16Test(ReadTest, unittest.TestCase):
    # UTF-16 with a byte order mark; runs the shared ReadTest checks and
    # adds BOM-specific ones.
    encoding = "utf-16"
    # The lone surrogate U+DC80 as one 16-bit code unit in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input made only of 0xff bytes must make the reader raise
        # UnicodeError.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final code unit is replaced or dropped according to
        # the error handler; the single input byte counts as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000687
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian UTF-16 codec (no byte order mark)."""

    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        """Check the decoded text available after each successive input byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        decoded_so_far = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, decoded_so_far)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced in 'replace' mode."""
        # (raw bytes, text expected from the 'replace' error handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        astral = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(astral.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), astral)

731
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec (no byte order mark)."""

    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        """Check the decoded text available after each successive input byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        decoded_so_far = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, decoded_so_far)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced in 'replace' mode."""
        # (raw bytes, text expected from the 'replace' error handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for raw, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        astral = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(astral.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), astral)

775
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-8 codec.

    ``BOM`` is the byte prefix expected in front of encoder output; it is
    empty here and overridden by subclasses whose encoder emits a signature.
    """

    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        """Check the decoded text available after each successive input byte."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        decoded_so_far = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, decoded_so_far)

    def test_decoder_state(self):
        """Run the decoder state check on text spanning every UTF-8 length class."""
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        """Invalid bytes are handled according to the chosen error handler."""
        invalid = b'[\x80\xff]'
        # handler name -> text expected when decoding ``invalid``
        outcomes = (
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        )
        for error_handler, expected in outcomes:
            with self.subTest(data=invalid, error_handler=error_handler,
                              expected=expected):
                decoded = invalid.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        """Extend the inherited check with surrogateescape encoding cases."""
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # Only the first surrogate can be escaped; the error must span
        # exactly the two that cannot.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = caught.exception
        offending = error.object[error.start:error.end]
        self.assertEqual(offending, '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        """'surrogatepass' encodes/decodes lone surrogates and rejects truncation."""
        handler = "surrogatepass"
        # (text, encoded bytes without BOM) pairs checked in both directions
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + raw)
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, handler),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        for text, raw in round_trips:
            self.assertEqual(raw.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))
        # Truncated or interrupted surrogate sequences must still fail.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, handler)

Martin v. Löwisdb12d452009-05-02 18:52:14 +0000851
Victor Stinnerf96418d2015-09-21 23:06:27 +0200852
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the cp65001 codec; skipped unless running on win32."""

    encoding = "cp65001"

    def test_encode(self):
        """Encode a table of texts; an expected value of None means 'must raise'."""
        # (text, error handler, expected bytes or None)
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                # Report an unexpected encode error as a test failure.
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode a table of byte strings; an expected value of None means 'must raise'."""
        # (raw bytes, error handler, expected text or None)
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                # Report an unexpected decode error as a test failure.
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    def test_lone_surrogates(self):
        """Lone surrogates fail in strict mode and follow each error handler."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # (error handler, expected encoding of "[\uDC80]")
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    def test_surrogatepass_handler(self):
        """'surrogatepass' round-trips lone surrogates in both directions."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))

940
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200941
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec."""

    encoding = "utf-7"

    def test_ascii(self):
        """Check which ASCII characters are encoded directly and which are not."""
        codec = self.encoding

        def check_direct(chars):
            # Characters that must pass through unchanged in both directions.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(codec), as_ascii)
            self.assertEqual(as_ascii.decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        every_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(every_ascii - already_checked))
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        """Check the decoded text available after each successive input byte."""
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        decoded_so_far = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, decoded_so_far)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced in 'replace' mode."""
        # (raw bytes, text expected from the 'replace' error handler)
        cases = (
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        )
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        """Non-BMP characters round-trip, with or without the closing '-'."""
        codec = self.encoding
        self.assertEqual('\U000104A0'.encode(codec), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(codec), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(codec), '\U000104A0')

        euro_astral = '\u20ac\U000104A0'
        self.assertEqual(euro_astral.encode(codec), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(codec), euro_astral)
        self.assertEqual(b'+IKzYAdyg'.decode(codec), euro_astral)

        euro_euro_astral = '\u20ac\u20ac\U000104A0'
        self.assertEqual(euro_euro_astral.encode(codec), b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(codec), euro_euro_astral)
        self.assertEqual(b'+IKwgrNgB3KA'.decode(codec), euro_euro_astral)

    def test_lone_surrogates(self):
        """Decode inputs containing a lone high surrogate with 'replace'."""
        # (raw bytes, text expected from the 'replace' error handler)
        cases = (
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        )
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001073
1074
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        """A single stray byte is rejected in strict, final mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling without arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()

1082
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array.array is returned as bytes together with its length."""
        import array
        buffer = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buffer), (b"spam", 4))

    def test_empty(self):
        """An empty str yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or a non-buffer argument raises TypeError."""
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)

1098
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the utf-8-sig codec.

    Reuses the UTF8Test cases with ``BOM`` set to the UTF-8 signature and
    adds checks for how the signature is consumed when decoding.
    """

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        """Check the decoded text available after each successive input byte."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        """A byte string holding only the signature decodes to an empty str."""
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder strips the signature the encoder wrote."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader at many chunk sizes.

        For each size hint (None means a plain ``read()``), the stream is read
        until it returns an empty string and the concatenated output must
        equal *unistring*.  Each size hint runs in its own subTest so a
        failure names the offending size and the remaining sizes still run.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Input that starts with the signature is decoded without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        """Input without a signature is decoded unchanged."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

1183
1184class EscapeDecodeTest(unittest.TestCase):
1185 def test_empty(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001186 self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Serhiy Storchaka8490f5a2015-03-20 09:00:36 +02001187 self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +00001188
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001189 def test_raw(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001190 decode = codecs.escape_decode
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001191 for b in range(256):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001192 b = bytes([b])
1193 if b != b'\\':
1194 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001195
1196 def test_escape(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001197 decode = codecs.escape_decode
1198 check = coding_checker(self, decode)
1199 check(b"[\\\n]", b"[]")
1200 check(br'[\"]', b'["]')
1201 check(br"[\']", b"[']")
R David Murray110b6fe2016-09-08 15:34:08 -04001202 check(br"[\\]", b"[\\]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001203 check(br"[\a]", b"[\x07]")
1204 check(br"[\b]", b"[\x08]")
1205 check(br"[\t]", b"[\x09]")
1206 check(br"[\n]", b"[\x0a]")
1207 check(br"[\v]", b"[\x0b]")
1208 check(br"[\f]", b"[\x0c]")
1209 check(br"[\r]", b"[\x0d]")
1210 check(br"[\7]", b"[\x07]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001211 check(br"[\78]", b"[\x078]")
1212 check(br"[\41]", b"[!]")
1213 check(br"[\418]", b"[!8]")
1214 check(br"[\101]", b"[A]")
1215 check(br"[\1010]", b"[A0]")
1216 check(br"[\501]", b"[A]")
1217 check(br"[\x41]", b"[A]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001218 check(br"[\x410]", b"[A0]")
R David Murray110b6fe2016-09-08 15:34:08 -04001219 for i in range(97, 123):
1220 b = bytes([i])
1221 if b not in b'abfnrtvx':
1222 with self.assertWarns(DeprecationWarning):
1223 check(b"\\" + b, b"\\" + b)
1224 with self.assertWarns(DeprecationWarning):
1225 check(b"\\" + b.upper(), b"\\" + b.upper())
1226 with self.assertWarns(DeprecationWarning):
1227 check(br"\8", b"\\8")
1228 with self.assertWarns(DeprecationWarning):
1229 check(br"\9", b"\\9")
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03001230 with self.assertWarns(DeprecationWarning):
1231 check(b"\\\xfa", b"\\\xfa")
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001232
1233 def test_errors(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001234 decode = codecs.escape_decode
1235 self.assertRaises(ValueError, decode, br"\x")
1236 self.assertRaises(ValueError, decode, br"[\x]")
1237 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1238 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1239 self.assertRaises(ValueError, decode, br"\x0")
1240 self.assertRaises(ValueError, decode, br"[\x0]")
1241 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
1242 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001243
Victor Stinnerf96418d2015-09-21 23:06:27 +02001244
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001245class RecodingTest(unittest.TestCase):
1246 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001247 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +02001248 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001249 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001250 f2.close()
1251 # Python used to crash on this at exit because of a refcount
1252 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +00001253
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001254 self.assertTrue(f.closed)
1255
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256# From RFC 3492
1257punycode_testcases = [
1258 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001259 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1260 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001261 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001262 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001263 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001264 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001266 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001267 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001269 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1270 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1271 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001272 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001274 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1275 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1276 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001277 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001278 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001279 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001280 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1281 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1282 "\u0939\u0948\u0902",
1283 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001284
1285 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001286 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001287 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1288 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289
1290 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001291 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1292 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1293 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001294 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1295 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001296
1297 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001298 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1299 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1300 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1301 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001302 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303
1304 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001305 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1306 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1307 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1308 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1309 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001310 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311
1312 # (K) Vietnamese:
1313 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1314 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001315 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1316 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1317 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1318 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001319 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001322 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001323 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001324
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001326 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1327 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1328 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001329 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001330
1331 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001332 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1333 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1334 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001335 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001336
1337 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001338 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001339 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340
1341 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001342 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1343 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001344 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001345
1346 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001347 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001348 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349
1350 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001351 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001352 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353
1354 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001355 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1356 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001357 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358 ]
1359
# Import-time sanity check of the table above: every entry must be a
# two-item sequence.  Offending entries are printed rather than raised.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001363
Victor Stinnerf96418d2015-09-21 23:06:27 +02001364
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the (text, bytes) sample table."""

    def test_encode(self):
        """Encoding each sample text yields the sample bytes, ignoring case."""
        for text, expected in punycode_testcases:
            produced = text.encode("punycode")
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our encoder emits
            # only lower case.  Lower-casing just the expected value is
            # not enough, because some input characters are upper case
            # as well.
            self.assertEqual(
                str(produced, "ascii").lower(),
                str(expected, "ascii").lower(),
            )

    def test_decode(self):
        """Decoding each sample bytes value yields the sample text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from an equal-valued bytes object rebuilt
            # through an ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001383
Victor Stinnerf96418d2015-09-21 23:06:27 +02001384
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Reject "code points" above 0x10ffff and malformed lengths."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        little_endian = sys.byteorder == "little"

        # The samples are written big-endian and byte-swapped when the
        # host is little-endian.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]

        for raw, expected in valid:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings():
                self.assertEqual(expected, raw.decode("unicode_internal"))

        for raw in rejected:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings(deprecation):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")

        # A single out-of-range unit, checked under three error handlers.
        if little_endian:
            invalid, escaped = b"\x00\x00\x11\x00", r"\x00\x00\x11\x00"
        else:
            invalid, escaped = b"\x00\x11\x00\x00", r"\x00\x11\x00\x00"

        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            backslashed = invalid.decode("unicode_internal",
                                         "backslashreplace")
            self.assertEqual(backslashed, escaped)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError names the codec, input and bad range."""
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(deprecation):
                payload.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(payload, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            # Decoding succeeded although an error was expected.
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered "ignore" handler drops the invalid unit."""
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        with support.check_warnings(deprecation):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four 0x22 bytes between the encoded "a" and "b".
            corrupted = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "ascii")
            ignored = decode(corrupted, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items consumed."""
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with support.check_warnings(deprecation):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            consumed_one = encode("a")[1]
            self.assertEqual(consumed_one, 1)
            consumed_two = encode("\xe9\u0142")[1]
            self.assertEqual(consumed_two, 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001465
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# NameprepTest (below) treats an expected value of None as "nameprep must
# raise UnicodeError for this input", and skips (None, None) entries, which
# are placeholders that keep the position of later entries in step with the
# "3.N" numbering used in the comments and in failure messages.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the input bytes \xef\xb9\xb6 below are the UTF-8 encoding of
    # U+FE76, not U+FB38; the heading is kept as found.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1618
1619
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Each vector maps to its expected output or raises UnicodeError."""
        from encodings.idna import nameprep
        # Number from 1 so failure messages match the "3.N" vector labels.
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                raise support.TestFailed(
                    "Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001638
Victor Stinnerf96418d2015-09-21 23:06:27 +02001639
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes plain and xn-- labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes plain and non-ASCII labels."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time decoding matches one-shot decoding."""
        # Same four names as test_builtin_decode, including the IDN
        # without a trailing dot, so the final flush of a non-ASCII
        # name's last label is covered.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                pieces = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(pieces, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # The last label is held back until a dot or final=True shows
        # that it is complete.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time encoding matches one-shot encoding."""
        # Same four names as test_builtin_encode, including the IDN
        # without a trailing dot.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # As with decoding, an unterminated last label is only emitted
        # on the final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        # "strict" must work for both directions...
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        # ...and every other handler must be rejected.
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1725
Victor Stinnerf96418d2015-09-21 23:06:27 +02001726
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode: positional, default-encoding and keyword use."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        """codecs.encode: positional, default-encoding and keyword use."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        """codecs.register rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        """codecs.getencoder rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """codecs.getreader rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """codecs.getwriter rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """Codec lookup is unaffected by a Turkish LC_CTYPE locale."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_locale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        """codecs.__all__ lists exactly the expected, resolvable names."""
        expected_names = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_names, codecs.__all__)
        # Every advertised name must actually exist on the module.
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        """codecs.open returns a StreamReaderWriter for every mode tried."""
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec raises UnicodeError for any input."""
        for text, raw in (('abc', b'abc'), ('', b'')):
            self.assertRaises(UnicodeError, codecs.encode, text, 'undefined')
            self.assertRaises(UnicodeError, codecs.decode, raw, 'undefined')
        # The error handler makes no difference.
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1834
Victor Stinnerf96418d2015-09-21 23:06:27 +02001835
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 StreamReader."""

    def setUp(self):
        # Two lines of UTF-8 text: U+D55C, newline, U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the newline."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001845
Victor Stinnerf96418d2015-09-21 23:06:27 +02001846
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding wrappers."""

    def test_basic(self):
        """Reading and writing through EncodedFile transcode the data."""
        # Reading: the UTF-8 file content comes back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        reading = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(reading.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data is stored in the file as Latin-1.
        sink = io.BytesIO()
        writing = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        writing.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001858
# Names of the text encodings iterated over by the generic codec tests
# (for example BasicUnicodeTest.test_basics loops over this list).
# Codecs that are only conditionally available are appended after the
# literal.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
1963
1964if hasattr(codecs, "mbcs_encode"):
1965 all_unicode_encodings.append("mbcs")
Steve Dowerf5aba582016-09-06 19:42:27 -07001966if hasattr(codecs, "oem_encode"):
1967 all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001968
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001969# The following encoding is not tested, because it's not supposed
1970# to work:
1971# "undefined"
1972
1973# The following encodings don't work in stateful mode
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001974broken_unicode_with_stateful = [
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001975 "punycode",
1976 "unicode_internal"
1977]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001978
Victor Stinnerf96418d2015-09-21 23:06:27 +02001979
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks that are run once for every name in
    # all_unicode_encodings.  Each codec is wrapped in its own sub-test so
    # that a failure is reported together with the codec name and the
    # remaining codecs are still exercised.

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                # The registered codec name must agree with the list entry
                # ("-" and "_" are interchangeable; latin_1 is special-cased).
                name = codecs.lookup(encoding).name
                if encoding.endswith("_codec"):
                    name += "_codec"
                elif encoding == "latin_1":
                    name = "latin_1"
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

                with support.check_warnings():
                    # unicode-internal has been deprecated
                    (b, size) = codecs.getencoder(encoding)(s)
                    self.assertEqual(size, len(s), "encoding=%r" % encoding)
                    (chars, size) = codecs.getdecoder(encoding)(b)
                    self.assertEqual(chars, s, "encoding=%r" % encoding)

                if encoding in broken_unicode_with_stateful:
                    # Everything below needs a codec that works statefully.
                    continue

                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    # feed the reader a single byte at a time
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

                # check incremental decoder/encoder and
                # iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for t in range(5):
                    # Test that calling seek resets the internal codec
                    # state and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument and (for most codecs) a
        # non-bytes argument with TypeError.
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Encoders must reject a call without arguments with TypeError.
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2141
Victor Stinnerf96418d2015-09-21 23:06:27 +02002142
class CharmapTest(unittest.TestCase):
    # Tests for codecs.charmap_decode() with the three supported mapping
    # flavours: a str indexed by byte value, a dict of int -> str and a
    # dict of int -> int (code point).

    # Input shared by almost every case below.
    _SAMPLE = b"\x00\x01\x02"

    def _expect_decoded(self, cases):
        # cases: iterable of (errors, mapping, expected_text); all three
        # input bytes must always be reported as consumed.
        for errors, mapping, text in cases:
            outcome = codecs.charmap_decode(self._SAMPLE, errors, mapping)
            self.assertEqual(outcome, (text, 3))

    def _expect_strict_failure(self, exc_type, mappings):
        # Each mapping must make a "strict" decode of the sample raise.
        for mapping in mappings:
            with self.assertRaises(exc_type):
                codecs.charmap_decode(self._SAMPLE, "strict", mapping)

    def _expect_everything_ignored(self, empty_mapping):
        # With an empty mapping and "ignore", every byte is dropped.
        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", empty_mapping),
            ("", len(every_byte))
        )

    def test_decode_with_string_map(self):
        self._expect_decoded([
            ("strict", "abc", "abc"),
            ("strict", "\U0010FFFFbc", "\U0010FFFFbc"),
            # byte 2 is out of range, or mapped to U+FFFE ("undefined")
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("backslashreplace", "ab", "ab\\x02"),
            ("backslashreplace", "ab\ufffe", "ab\\x02"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._expect_strict_failure(UnicodeDecodeError, ["ab", "ab\ufffe"])
        self._expect_everything_ignored("")

    def test_decode_with_int2str_map(self):
        missing = {0: 'a', 1: 'b'}
        to_none = {0: 'a', 1: 'b', 2: None}
        to_fffe = {0: 'a', 1: 'b', 2: '\ufffe'}  # Issue #14850

        self._expect_decoded([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
        ])
        self._expect_strict_failure(UnicodeDecodeError,
                                    [missing, to_none, to_fffe])
        for undefined in (missing, to_none, to_fffe):
            self._expect_decoded([
                ("replace", undefined, "ab\ufffd"),
                ("backslashreplace", undefined, "ab\\x02"),
                ("ignore", undefined, "ab"),
            ])
        self._expect_everything_ignored({})

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')
        missing = {0: a, 1: b}
        to_fffe = {0: a, 1: b, 2: 0xFFFE}

        self._expect_decoded([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             chr(sys.maxunicode) + "bc"),
        ])
        # A code point beyond the Unicode range is a TypeError, not a
        # decoding error.
        self._expect_strict_failure(TypeError,
                                    [{0: sys.maxunicode + 1, 1: b, 2: c}])
        self._expect_strict_failure(UnicodeDecodeError, [missing, to_fffe])
        for undefined in (missing, to_fffe):
            self._expect_decoded([
                ("replace", undefined, "ab\ufffd"),
                ("backslashreplace", undefined, "ab\\x02"),
                ("ignore", undefined, "ab"),
            ])
2377
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002378
class WithStmtTest(unittest.TestCase):
    # The codecs file wrappers must be usable as context managers and must
    # close the wrapped stream when the "with" block is left.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Leaving the block must close the underlying stream, exactly as
        # checked for EncodedFile in test_encodedfile.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002392
Victor Stinnerf96418d2015-09-21 23:06:27 +02002393
class TypesTest(unittest.TestCase):
    # Checks which argument types the low-level decode functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7",
            "utf_8",
            "utf_16_le",
            "utf_16_be",
            "utf_16_ex",
            "utf_32",
            "utf_32_le",
            "utf_32_be",
            "utf_32_ex",
            "latin_1",
            "ascii",
            "charmap",
        )
        decoders = [getattr(codecs, prefix + "_decode")
                    for prefix in codec_names]
        # mbcs_decode is optional; include it only when codecs provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.  Both
        # escape decoders are expected to behave identically here.
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

            # \U00110000 is beyond the last valid code point.
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002433
Serhiy Storchakad6793772013-01-29 10:20:44 +02002434
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.unicode_escape_encode("")
        decoded = codecs.unicode_escape_decode(b"")
        self.assertEqual(encoded, (b"", 0))
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through as is.
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte other than the backslash decodes to the character
        # with the same ordinal.
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(
                codecs.unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        for text, escaped in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(text, escaped)
        # Remaining control characters and 0x7f-0xff use \xNN.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in recognised:
            check(raw, text)

        # Any other backslash + letter is kept verbatim but must emit a
        # DeprecationWarning.
        for code in range(ord("a"), ord("z") + 1):
            lower = bytes([code])
            upper = lower.upper()
            if lower not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, "\\" + chr(code))
            if upper not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + upper, "\\" + chr(code - 32))
        for raw, text in (
            (br"\8", "\\8"),
            (br"\9", "\\9"),
            (b"\\\xfa", "\\\xfa"),
        ):
            with self.assertWarns(DeprecationWarning):
                check(raw, text)

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of leading hex digits tried): every
        # escape built here is too short to be valid.
        for letter, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + letter + b"0"*count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2521
2522
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() /
    # raw_unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode("")
        decoded = codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(encoded, (b"", 0))
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Every character below U+0100 maps to the byte of the same value.
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the character with the same ordinal.
        for code in range(256):
            self.assertEqual(
                codecs.raw_unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash is only special in front of "u" or "U".
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash is only special in front of "u" or "U".
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of leading hex digits tried): every
        # escape built here is too short to be valid.
        for letter, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + letter + b"0"*count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2571
2572
class EscapeEncodeTest(unittest.TestCase):
    # Tests for codecs.escape_encode(), which works on bytes only.

    def test_escape_encode(self):
        # (input, escaped output); the second item of the result is always
        # the number of input bytes.
        samples = (
            (b'', b''),
            (b'foobar', b'foobar'),
            (b'spam\0eggs', b'spam\\x00eggs'),
            (b'a\'b', b"a\\'b"),
            (b'b\\c', b'b\\\\c'),
            (b'c\nd', b'c\\nd'),
            (b'd\re', b'd\\re'),
            (b'f\x7fg', b'f\\x7fg'),
        )
        for data, escaped in samples:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data),
                                 (escaped, len(data)))
        # Neither str nor bytearray input is accepted.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2591
2592
class SurrogateEscapeTest(unittest.TestCase):
    # Tests for the "surrogateescape" error handler: undecodable bytes
    # become lone surrogates U+DC80..U+DCFF and are turned back into the
    # original bytes when encoding.

    def _check_roundtrip(self, encoding, raw, text):
        # raw must decode to text and text must encode back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2625
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002626
class BomTest(unittest.TestCase):
    # Checks how codecs.open() streams for the UTF-16/UTF-32 codec family
    # handle the BOM when a file is written, rewound and written again.

    def test_seek0(self):
        payload = "1234567890"
        doubled = payload * 2
        encodings_under_test = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        for enc in encodings_under_test:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=enc) as stream:
                stream.write(payload)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=enc) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=enc) as stream:
                writer = stream.writer
                writer.write(payload[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=enc) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=enc) as stream:
                writer = stream.writer
                writer.write(payload)
                writer.seek(writer.tell())
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002682
Victor Stinner3fed0872010-05-22 02:16:27 +00002683
# Codecs that transform bytes into bytes.  The optional compression codecs
# are appended further down, once their backing module has been imported.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Canonical transform codec name -> alternative names that must resolve to it.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so later code can test for zlib availability.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002713
Victor Stinnerf96418d2015-09-21 23:06:27 +02002714
class TransformCodecTest(unittest.TestCase):
    # Exercises the bytes-to-bytes transform codecs (plus rot_13) through the
    # codecs module, and checks that the str/bytes convenience methods
    # refuse to use them.

    def test_basics(self):
        all_bytes = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                encode = codecs.getencoder(encoding)
                encoded, consumed = encode(all_bytes)
                self.assertEqual(consumed, len(all_bytes))
                decode = codecs.getdecoder(encoding)
                decoded, consumed = decode(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, all_bytes)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                stream_reader = codecs.getreader(encoding)
                reader = stream_reader(io.BytesIO(encoded))
                self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                stream_reader = codecs.getreader(encoding)
                reader = stream_reader(io.BytesIO(encoded))
                self.assertEqual(reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                original_view = memoryview(original)
                encoded = codecs.encode(original, encoding)
                view_encoded = codecs.encode(original_view, encoding)
                self.assertEqual(view_encoded, encoded)
                encoded_view = memoryview(encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, original)
                view_decoded = codecs.decode(encoded_view, encoding)
                self.assertEqual(view_decoded, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        # The wrapper must chain to an original error of a compatible type.
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    resolved = codecs.lookup(alias)
                    self.assertEqual(resolved.name, canonical)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        decoded = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(decoded, unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2846
Nick Coghlan8b097b42013-11-13 23:49:21 +10002847
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# A single search function backed by a local registry is registered once, so
# the tests do not appear to leak objects by registering many search
# functions.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Codec search function: the registered entry, or None when the name is
    # not one of the test codecs.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Nothing to forget when the helper is unavailable.
        pass
2868
2869
class ExceptionChainingTest(unittest.TestCase):
    # Checks which exceptions raised inside a codec get wrapped with
    # "<operation> with <codec> codec failed" context and which are left
    # untouched.

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = "{!r}{}".format(self, id(self))
        normalized = encodings.normalize_encoding(unique_id)
        self.codec_name = normalized.lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode*/*decode* as this test's uniquely named codec."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect the body to raise a wrapped *exc_type* for *operation*.

        The raised exception must match the wrapper message and be chained
        to an original *exc_type* instance that still has its traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        original = caught.exception.__cause__
        self.assertIsInstance(original, exc_type)
        self.assertIsNotNone(original.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check that *obj_to_raise* is wrapped on every encode/decode path."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        attempts = (
            ("encoding", lambda: "str_input".encode(self.codec_name)),
            ("encoding", lambda: codecs.encode("str_input", self.codec_name)),
            ("decoding", lambda: b"bytes input".decode(self.codec_name)),
            ("decoding",
             lambda: codecs.decode(b"bytes input", self.codec_name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check that *obj_to_raise* propagates unwrapped on every path."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, self.codec_name), "not bytes!")
        self.assertEqual(codecs.decode(None, self.codec_name), b"not str!")
        # Text model methods should complain
        encode_fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                      r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError,
                                    encode_fmt.format(self.codec_name)):
            "str_input".encode(self.codec_name)
        decode_fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                      r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError,
                                    decode_fmt.format(self.codec_name)):
            b"bytes input".decode(self.codec_name)
3029
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003030
Georg Brandl02524622010-12-02 18:06:51 +00003031
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must name the code page that failed.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run a table of decode cases against code page *cp*.

        Each entry of *tests* is ``(raw, errors, expected)``.  When
        *expected* is None the decode must raise UnicodeDecodeError;
        otherwise the decoded text must equal *expected* and the reported
        consumed length must lie within ``0..len(raw)``.
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Run a table of encode cases against code page *cp*.

        Each entry of *tests* is ``(text, errors, expected)``.  When
        *expected* is None the encode must raise UnicodeEncodeError;
        otherwise the encoded bytes must equal *expected* and the reported
        consumed length must equal ``len(text)``.
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left unconsumed.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        # bigmemtest calls the test with the size as an extra positional
        # argument, so the method has to accept it; the input length is
        # derived from that value (2**31 here, since dry runs are disabled).
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Release the input before doing further checks on the output.
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3223
Victor Stinner3a50e702011-10-18 21:21:00 +02003224
class ASCIITest(unittest.TestCase):
    # Covers the ascii codec: plain round trips plus every error handler
    # that can substitute for unencodable or undecodable input.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        mixed = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            mixed.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3262
3263
Victor Stinnerc3713e92015-09-29 12:32:13 +02003264class Latin1Test(unittest.TestCase):
3265 def test_encode(self):
3266 for data, expected in (
3267 ('abc', b'abc'),
3268 ('\x80\xe9\xff', b'\x80\xe9\xff'),
3269 ):
3270 with self.subTest(data=data, expected=expected):
3271 self.assertEqual(data.encode('latin1'), expected)
3272
3273 def test_encode_errors(self):
3274 for data, error_handler, expected in (
3275 ('[\u20ac\udc80]', 'ignore', b'[]'),
3276 ('[\u20ac\udc80]', 'replace', b'[??]'),
Victor Stinner797485e2015-10-09 03:17:30 +02003277 ('[\u20ac\U000abcde]', 'backslashreplace',
3278 b'[\\u20ac\\U000abcde]'),
Victor Stinnerc3713e92015-09-29 12:32:13 +02003279 ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
3280 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
3281 ):
3282 with self.subTest(data=data, error_handler=error_handler,
3283 expected=expected):
3284 self.assertEqual(data.encode('latin1', error_handler),
3285 expected)
3286
3287 def test_encode_surrogateescape_error(self):
3288 with self.assertRaises(UnicodeEncodeError):
3289 # the first character can be decoded, but not the second
3290 '\udc80\u20ac'.encode('latin1', 'surrogateescape')
3291
3292 def test_decode(self):
3293 for data, expected in (
3294 (b'abc', 'abc'),
3295 (b'[\x80\xff]', '[\x80\xff]'),
3296 ):
3297 with self.subTest(data=data, expected=expected):
3298 self.assertEqual(data.decode('latin1'), expected)
3299
3300
# Run every test case defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()