blob: 893212e243eff4d66a8541c4afbe2ea3f3d59eec [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
Antoine Pitrou00b2c862011-10-05 13:01:41 +020012try:
Victor Stinner3d4226a2018-08-29 22:21:32 +020013 import _testcapi
14except ImportError as exc:
15 _testcapi = None
16
17try:
Antoine Pitrou00b2c862011-10-05 13:01:41 +020018 import ctypes
19except ImportError:
20 ctypes = None
21 SIZEOF_WCHAR_T = -1
22else:
23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *coder*.

    The helper calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result equals the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
29
Victor Stinnerf96418d2015-09-21 23:06:27 +020030
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type (e.g. b"" or "")
        # that later write()/read() calls operate on.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* drains everything; otherwise at most *size*
        items are returned and the remainder stays queued.
        """
        pending = self._buffer
        if size < 0:
            # pending[:0] is an empty sequence of the same type
            chunk, self._buffer = pending, pending[:0]
        else:
            chunk, self._buffer = pending[:size], pending[size:]
        return chunk
50
Victor Stinnerf96418d2015-09-21 23:06:27 +020051
class MixInCheckStateHandling:
    """Mixin with helpers checking getstate()/setstate() of incremental codecs.

    The host class must provide the unittest assertion methods
    (``assertEqual``, ``assertFalse``, ``assertIsInstance``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that a decoder's state can be moved to a fresh decoder.

        For every split point of the encoded input *s*, the first part is
        decoded, the decoder state is extracted and installed into a new
        decoder, and the rest is decoded there.  The concatenated output
        must equal *u*.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = make_decoder()
            part1 = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previously buffered input must not produce
                # any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = make_decoder()
            decoder.setstate(state)
            part2 = decoder.decode(s[split:], True)
            self.assertEqual(u, part1 + part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that an encoder's state can be moved to a fresh encoder.

        For every split point of the text *u*, the first part is encoded,
        the encoder state is extracted and installed into a new encoder,
        and the rest is encoded there.  The concatenated output must
        equal *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = make_encoder()
            part1 = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = make_encoder()
            encoder.setstate(state)
            part2 = encoder.encode(u[split:], True)
            self.assertEqual(s, part1 + part2)
84
Victor Stinnerf96418d2015-09-21 23:06:27 +020085
class ReadTest(MixInCheckStateHandling):
    """Shared reader/decoder tests for one codec.

    The methods read ``self.encoding`` and (in test_lone_surrogates)
    ``self.ill_formed_sequence``; neither is defined here, so concrete
    test classes are expected to supply them as class attributes.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* one byte at a time.

        After each byte the accumulated decoded text must equal the
        corresponding entry of *partialresults*.  The check is done with a
        StreamReader, with an incremental decoder, again after reset(),
        and finally through codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode(), again feeding single-byte chunks
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() with every line terminator listed
        # below, with and without keepends, and with a size hint.
        def getreader(input):
            # Fresh StreamReader over the encoded form of input
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so line boundaries are visible in the result
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []   # lines with their terminator
        vwo = []  # the same lines without terminator
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                # ten repetitions of: size "a"s + terminator, then "xxx\n"
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Interleave readline(), read(), read(n) and readlines() on the same
        # reader; each section below starts from a fresh reader.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        # skip the first line plus the one character consumed by read(1)
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterate a StreamReader over a multi-line, \r\n-terminated document
        # and check that every line comes back unchanged.  The list below is
        # test data only; it is never executed.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # NOTE(review): only the lines the reader actually yields are
        # compared; the number of yielded lines is not checked here.
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # A writer and a reader share one Queue, so text written through the
        # writer becomes available to readline() incrementally.  The
        # assertions depend on the order of the write() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        # the "\n" that follows the already-returned "foo\r" reads as an
        # empty line here
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three \r\n-terminated lines of different lengths must be returned
        # by readline() one at a time, followed by "" at end of input.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same as test_bug1098990_a, with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of self.ill_formed_sequence when decoding with
    # the "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate must fail in strict mode and follow the
        # selected error handler otherwise.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever encoding "" produces (e.g. a BOM) is stripped from the
        # individually encoded pieces and put back once at the front.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            # "backslashreplace" on decoding is expected to emit one \xNN
            # escape per byte of the ill-formed sequence
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200408
Victor Stinnerf96418d2015-09-21 23:06:27 +0200409
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest checks for the "utf-32" codec plus UTF-32 specific tests."""

    encoding = "utf-32"
    # The lone surrogate U+DC80 as a single 32-bit unit in the byte order
    # of the running platform.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little-endian and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two write() calls on one StreamWriter must emit a single BOM.
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        for data in (4*b"\xff", 8*b"\xff"):
            with self.subTest(data=data):
                f = codecs.getreader(self.encoding)(io.BytesIO(data))
                self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected result per input byte: four BOM bytes, then four
        # bytes per character.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit is replaced or dropped by the handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A truncated final unit is an error in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # The state round trip must work for either byte order.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
504
Victor Stinnerf96418d2015-09-21 23:06:27 +0200505
class UTF32LETest(ReadTest, unittest.TestCase):
    """ReadTest checks for "utf-32-le" plus little-endian specific tests."""

    encoding = "utf-32-le"
    # The lone surrogate U+DC80 as one little-endian 32-bit unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected result per input byte; each character takes four
        # bytes and only appears once its last byte has been fed.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        # Least significant byte comes first.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # A truncated final unit is an error in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
550
Victor Stinnerf96418d2015-09-21 23:06:27 +0200551
class UTF32BETest(ReadTest, unittest.TestCase):
    """ReadTest checks for "utf-32-be" plus big-endian specific tests."""

    encoding = "utf-32-be"
    # The lone surrogate U+DC80 as one big-endian 32-bit unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected result per input byte; each character takes four
        # bytes and only appears once its last byte has been fed.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        # Most significant byte comes first.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # A truncated final unit is an error in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
596
597
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest checks for the "utf-16" codec plus UTF-16 specific tests."""

    encoding = "utf-16"
    # The lone surrogate U+DC80 as a single 16-bit unit in the byte order
    # of the running platform.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little-endian and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two write() calls on one StreamWriter must emit a single BOM.
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            with self.subTest(data=data):
                f = codecs.getreader(self.encoding)(io.BytesIO(data))
                self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected result per input byte: two BOM bytes, two bytes per
        # BMP character and four bytes for the final non-BMP character.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit is replaced or dropped by the handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A truncated final unit is an error in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # The state round trip must work for either byte order.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # opening with mode 'U' is expected to emit a DeprecationWarning
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000683
class UTF16LETest(ReadTest, unittest.TestCase):
    # Little-endian UTF-16 without BOM; the ill-formed sample is the
    # code unit 0xDC80 (a lone low surrogate) in little-endian byte order.
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # No BOM here, so only the very first step yields an empty string.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected_steps = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # Each input must fail under 'strict' and decode to the paired
        # string under 'replace'.
        malformed_cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for data, replaced in malformed_cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(data, 'strict', True)
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # U+10203 round-trips through its surrogate pair D800 DE03.
        astral = "\U00010203"
        pair_bytes = b'\x00\xd8\x03\xde'
        self.assertEqual(astral.encode(self.encoding), pair_bytes)
        self.assertEqual(pair_bytes.decode(self.encoding), astral)
727
class UTF16BETest(ReadTest, unittest.TestCase):
    # Big-endian UTF-16 without BOM; the ill-formed sample is the
    # code unit 0xDC80 (a lone low surrogate) in big-endian byte order.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # No BOM here, so only the very first step yields an empty string.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected_steps = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # Each input must fail under 'strict' and decode to the paired
        # string under 'replace'.
        malformed_cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for data, replaced in malformed_cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(data, 'strict', True)
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        # U+10203 round-trips through its surrogate pair D800 DE03.
        astral = "\U00010203"
        pair_bytes = b'\xd8\x00\xde\x03'
        self.assertEqual(astral.encode(self.encoding), pair_bytes)
        self.assertEqual(pair_bytes.decode(self.encoding), astral)
771
class UTF8Test(ReadTest, unittest.TestCase):
    # BOM is the prefix expected in front of encoded output; it is empty
    # for plain UTF-8 and is read through self.BOM in the tests below.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_decoder_state(self):
        # Sample text mixing characters from U+0000 up to U+10FFFF.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same two invalid bytes decoded under each error handler.
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # Only the two trailing surrogates are expected in the error range.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = caught.exception
        offending = error.object[error.start:error.end]
        self.assertEqual(offending, '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        handler = "surrogatepass"

        # Encoding: surrogates pass through as three-byte sequences.
        encode_cases = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        )
        for text, payload in encode_cases:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + payload)

        # Decoding: the same three-byte sequences come back as surrogates.
        decode_cases = (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        )
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))
        # Incomplete surrogate sequences must still raise.
        for malformed in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                malformed.decode(self.encoding, handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000847
Victor Stinnerf96418d2015-09-21 23:06:27 +0200848
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means the encode
        # call must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                '%a.encode("cp65001", %r)=%a != %a'
                % (text, errors, encoded, expected))

    def test_decode(self):
        # (bytes, error handler, expected text); None means the decode
        # call must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                '%a.decode("cp65001", %r)=%a != %a'
                % (raw, errors, decoded, expected))

    def test_lone_surrogates(self):
        # With the default handler a lone surrogate fails both ways.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # The same lone surrogate encoded under each error handler.
        per_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in per_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    def test_surrogatepass_handler(self):
        # Each pair is checked in both directions, encode first.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
936
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200937
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_ascii(self):
        def assert_same_as_ascii(chars):
            # The characters must encode exactly as in ASCII and decode
            # back to themselves.
            self.assertEqual(chars.encode(self.encoding),
                             chars.encode('ascii'))
            self.assertEqual(chars.encode('ascii').decode(self.encoding),
                             chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        assert_same_as_ascii(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        assert_same_as_ascii(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        assert_same_as_ascii(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_checked))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_steps = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # Each input must fail under 'strict' and decode to the paired
        # string under 'replace'.
        malformed_cases = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for data, replaced in malformed_cases:
            with self.subTest(raw=data):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(data, 'strict', True)
                self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        enc = self.encoding
        astral = '\U000104A0'
        # The astral character and its explicit surrogate pair encode alike.
        self.assertEqual(astral.encode(enc), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(enc), b'+2AHcoA-')
        # Decoding accepts the shifted run with or without the closing '-'.
        self.assertEqual(b'+2AHcoA-'.decode(enc), astral)
        self.assertEqual(b'+2AHcoA'.decode(enc), astral)
        prefixed_cases = (
            ('\u20ac' + astral, b'+IKzYAdyg-'),
            ('\u20ac\u20ac' + astral, b'+IKwgrNgB3KA-'),
        )
        for text, terminated in prefixed_cases:
            self.assertEqual(text.encode(enc), terminated)
            self.assertEqual(terminated.decode(enc), text)
            self.assertEqual(terminated[:-1].decode(enc), text)

    def test_lone_surrogates(self):
        # Inputs involving a lone high surrogate, decoded with 'replace'.
        surrogate_cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for data, decoded in surrogate_cases:
            with self.subTest(raw=data):
                self.assertEqual(data.decode('utf-7', 'replace'), decoded)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001070
1071
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # Strict, final decoding of a single byte must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1079
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # An array of signed bytes is returned as the equivalent bytes
        # object together with its length.
        signed_bytes = array.array("b", b"spam")
        result = codecs.readbuffer_encode(signed_bytes)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is accepted.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
1095
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Same tests as UTF8Test, but encoded output is expected to start
    # with the UTF-8 BOM.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "",
            "",
            "",  # First BOM has been read and skipped
            "",
            "",
            "\ufeff",  # Second BOM has been read and emitted
            "\ufeff\x00",  # "\x00" read and emitted
            "\ufeff\x00",  # First byte of encoded "\xff" read
            "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
            "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        bom_only = b"\xef\xbb\xbf"
        self.assertEqual(str(bom_only, "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the signature written by the
        # encoder.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader in chunks, once
        # without a size argument and once per listed size, and check that
        # the concatenated chunks equal unistring.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None, *range(1, 11), 64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            stream = make_reader(io.BytesIO(bytestring))
            read_args = () if sizehint is None else (sizehint,)
            chunks = []
            while True:
                chunk = stream.read(*read_args)
                if not chunk:
                    break
                chunks.append(chunk)
            self.assertEqual("".join(chunks), unistring)

    def test_stream_bom(self):
        # Input that starts with the signature.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without the signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1180
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            self.assertEqual(decode(byte + b'0'), (byte + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Recognised escapes: (escaped input, decoded output).
        valid_escapes = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, decoded in valid_escapes:
            check(escaped, decoded)
        # Unrecognised escapes are kept verbatim but must warn.  Every
        # upper-case letter is checked; lower-case ones only when they are
        # not one of the recognised escape letters.
        for code in range(97, 123):
            letter = bytes([code])
            if letter not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + letter, b"\\" + letter)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + letter.upper(), b"\\" + letter.upper())
        for unknown in (br"\8", br"\9", b"\\\xfa"):
            with self.assertWarns(DeprecationWarning):
                check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # A \x escape with too few hex digits: ValueError by default,
        # dropped with 'ignore', replaced by '?' with 'replace'.
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            twice = bracketed + truncated
            self.assertRaises(ValueError, decode, truncated)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(twice, "ignore"), (b"[]", len(twice)))
            self.assertEqual(decode(twice, "replace"), (b"[?]?", len(twice)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001240
Victor Stinnerf96418d2015-09-21 23:06:27 +02001241
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Wrap an in-memory byte stream in a recoding file object and write
        # a single character through it.
        f = io.BytesIO()
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Leaving the with block must also have closed the wrapped stream.
        self.assertTrue(f.closed)
1251
Martin v. Löwis2548c732003-04-18 10:39:54 +00001252# From RFC 3492
1253punycode_testcases = [
1254 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001255 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1256 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001257 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001258 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001259 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001260 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001261 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001262 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001263 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001264 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001265 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1266 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1267 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001268 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001269 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001270 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1271 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1272 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001273 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001275 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001276 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1277 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1278 "\u0939\u0948\u0902",
1279 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280
1281 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001282 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001283 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1284 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001285
1286 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001287 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1288 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1289 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001290 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1291 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292
1293 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001294 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1295 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1296 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1297 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001298 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001299
1300 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001301 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1302 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1303 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1304 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1305 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001306 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307
1308 # (K) Vietnamese:
1309 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1310 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001311 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1312 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1313 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1314 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001315 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001319 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001320
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001322 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1323 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1324 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001325 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326
1327 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001328 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1329 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1330 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001331 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332
1333 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001334 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001335 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001336
1337 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001338 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1339 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001340 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341
1342 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001343 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001344 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001345
1346 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001347 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001348 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349
1350 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001351 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1352 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001353 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001354 ]
1355
# Import-time sanity check: every punycode test case is expected to be a
# two-element tuple; any entry of a different length is printed.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001359
Victor Stinnerf96418d2015-09-21 23:06:27 +02001360
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases pairs."""

    def test_encode(self):
        """Encoding each sample text gives the expected punycode form."""
        for uni, puny in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while the codec produces
            # lower case only, and lowering just ``puny`` is insufficient
            # because some of the input characters are upper case too.
            produced = uni.encode("punycode").decode("ascii")
            expected = puny.decode("ascii")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        """Decoding each punycode sample gives back the original text."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Same check on a bytes object rebuilt via an ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Victor Stinnerf96418d2015-09-21 23:06:27 +02001380
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on little-endian
        # hosts so they match the native layout.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError raised by the codec describes the input."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports "UnicodeDecodeError not raised" on its own,
        # instead of a message-less self.fail().
        with self.assertRaises(UnicodeDecodeError) as cm:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is honoured by the decoder."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four extra bytes between the two encoded characters;
            # the "ignore" based handler is expected to drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """The encoder's reported length counts input characters."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001461
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks input that nameprep must reject, and
# (None, None) is a placeholder for a vector that is skipped.  The position
# in the list gives the vector number: index 0 is test 3.1.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz'
     b'\xef\xb8\x80\xef\xb8\x88\xef\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # (That is the vector's title; the bytes below encode U+FE76.)
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2\xaa'
     b'\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f'
     b'\xe3\x8c\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3\x83\x88'
     b'\xe3\x83\xabi\xcc\x87tel\x28d\x29'
     b'\xe3\x82\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1614
1615
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Each vector is either mapped to its expected form or rejected."""
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # subTest labels every outcome with the vector number (list
            # index 0 is test 3.1), keeps assertion failures as failures,
            # and lets the remaining vectors run after one of them fails.
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Victor Stinnerf96418d2015-09-21 23:06:27 +02001635
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes plain and xn-- labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes plain and non-ASCII labels."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading again after the data has been consumed returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte by byte, then in chunks that split labels."""
        # Plain and xn-- names, each with and without a trailing dot.
        samples = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in samples:
            with self.subTest(encoded=encoded):
                single_bytes = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(single_bytes, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The last label is only released once the input is final.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode character by character, then in chunks."""
        # Plain and non-ASCII names, each with and without a trailing dot.
        samples = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in samples:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The last label is only released once the input is final.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
Victor Stinnerf96418d2015-09-21 23:06:27 +02001722
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode(): positional use, defaults, errors, keywords."""
        raw = b'\xe4\xf6\xfc'
        text = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(raw, 'latin-1'), text)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=raw, encoding='latin-1'), text)
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        """codecs.encode(): positional use, defaults, errors, keywords."""
        text = '\xe4\xf6\xfc'
        raw = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(text, 'latin-1'), raw)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj=text, encoding='latin-1'), raw)
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects missing and unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Put the previous LC_CTYPE back once the test is over.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        self.assertEqual(codecs.lookup('ASCII').name, 'ascii')

    def test_all(self):
        """codecs.__all__ lists exactly the expected, existing names."""
        expected = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected, codecs.__all__)
        # Every exported name must resolve to an attribute of the module.
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        """codecs.open() returns a StreamReaderWriter in every mode."""
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec fails for any input and error handler."""
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
Victor Stinnerf96418d2015-09-21 23:06:27 +02001831
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through the UTF-8 stream reader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the line ending."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Victor Stinnerf96418d2015-09-21 23:06:27 +02001842
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile."""

    def test_basic(self):
        # Reading: UTF-8 content in the file comes back as UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written end up in the file as Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001854
# Names of the text encodings iterated over by BasicUnicodeTest, kept in
# sorted order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u", "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are only added when the codecs module provides the
# matching encode function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001964
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001965# The following encoding is not tested, because it's not supposed
1966# to work:
1967# "undefined"
1968
1969# The following encodings don't work in stateful mode
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001970broken_unicode_with_stateful = [
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001971 "punycode",
1972 "unicode_internal"
1973]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001974
Victor Stinnerf96418d2015-09-21 23:06:27 +02001975
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every name in all_unicode_encodings.

    Per-encoding work runs inside ``subTest(encoding=...)`` so that a failing
    codec is named in the report and does not hide failures in later codecs.
    """

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                # The name reported by the registry must match the list
                # entry once "-" and "_" are treated as the same character.
                name = codecs.lookup(encoding).name
                if encoding.endswith("_codec"):
                    name += "_codec"
                elif encoding == "latin_1":
                    name = "latin_1"
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

                with support.check_warnings():
                    # unicode-internal has been deprecated
                    (b, size) = codecs.getencoder(encoding)(s)
                    self.assertEqual(size, len(s), "encoding=%r" % encoding)
                    (chars, size) = codecs.getdecoder(encoding)(b)
                    self.assertEqual(chars, s, "encoding=%r" % encoding)

                # Everything below needs a codec that works in stateful mode.
                if encoding in broken_unicode_with_stateful:
                    continue

                # check stream reader/writer, one character / byte at a time
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

                # check incremental decoder/encoder and
                # iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    @unittest.skipIf(_testcapi is None, "requires the _testcapi module")
    def test_basics_capi(self):
        # _testcapi is None when its import failed at module load; skip
        # instead of failing with AttributeError on None.
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = _testcapi.codec_incrementalencoder(
                            encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(
                            encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for _ in range(5):
                    # Test that calling seek resets the internal codec state
                    # and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument, and (except for idna and
        # punycode) a non-bytes argument, with TypeError.
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Encoders must reject a missing argument with TypeError.
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2136
Victor Stinnerf96418d2015-09-21 23:06:27 +02002137
class CharmapTest(unittest.TestCase):
    """Check codecs.charmap_decode() with str, int->str and int->int maps."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        payload = b"\x00\x01\x02"

        # Maps that cover all three input bytes decode to themselves.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(payload, "strict", table), (table, 3))

        # Byte 2 is either past the end of the map or mapped to U+FFFE.
        undefined = ("ab", "ab\ufffe")
        for table in undefined:
            with self.assertRaises(UnicodeDecodeError):
                decode(payload, "strict", table)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for handler, text in handlers:
            for table in undefined:
                self.assertEqual(decode(payload, handler, table), (text, 3))

        # An empty map with "ignore" drops every byte but consumes them all.
        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        payload = b"\x00\x01\x02"

        # Values may be one character, several characters, or empty.
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, text in complete:
            self.assertEqual(decode(payload, "strict", table), (text, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (the U+FFFE cases are from issue #14850).
        undefined = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in undefined:
            with self.assertRaises(UnicodeDecodeError):
                decode(payload, "strict", table)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for handler, text in handlers:
            for table in undefined:
                self.assertEqual(decode(payload, handler, table), (text, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        payload = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        # The 0x10FFFF case is from issue #15379.
        complete = (
            ({0: a, 1: b, 2: c}, "abc"),
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for table, text in complete:
            self.assertEqual(decode(payload, "strict", table), (text, 3))

        # A code point above sys.maxunicode is a TypeError, not a decode error.
        with self.assertRaises(TypeError):
            decode(payload, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE.
        undefined = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in undefined:
            with self.assertRaises(UnicodeDecodeError):
                decode(payload, "strict", table)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for handler, text in handlers:
            for table in undefined:
                self.assertEqual(decode(payload, handler, table), (text, 3))
2372
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002373
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers must work as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        wrapped = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with wrapped as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the with-block must have closed the underlying stream.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        stream = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with stream as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002387
Victor Stinnerf96418d2015-09-21 23:06:27 +02002388
class TypesTest(unittest.TestCase):
    """Check which low-level decoders accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        names = (
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        )
        decoders = [getattr(codecs, attr) for attr in names]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for data in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(data), ("\u1234", 6))

        # \U00110000 is rejected under "strict"; the other handlers consume
        # all 10 input characters.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002428
Serhiy Storchakad6793772013-01-29 10:20:44 +02002429
2430class UnicodeEscapeTest(unittest.TestCase):
2431 def test_empty(self):
2432 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2433 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2434
2435 def test_raw_encode(self):
2436 encode = codecs.unicode_escape_encode
2437 for b in range(32, 127):
2438 if b != b'\\'[0]:
2439 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2440
2441 def test_raw_decode(self):
2442 decode = codecs.unicode_escape_decode
2443 for b in range(256):
2444 if b != b'\\'[0]:
2445 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2446
2447 def test_escape_encode(self):
2448 encode = codecs.unicode_escape_encode
2449 check = coding_checker(self, encode)
2450 check('\t', br'\t')
2451 check('\n', br'\n')
2452 check('\r', br'\r')
2453 check('\\', br'\\')
2454 for b in range(32):
2455 if chr(b) not in '\t\n\r':
2456 check(chr(b), ('\\x%02x' % b).encode())
2457 for b in range(127, 256):
2458 check(chr(b), ('\\x%02x' % b).encode())
2459 check('\u20ac', br'\u20ac')
2460 check('\U0001d120', br'\U0001d120')
2461
2462 def test_escape_decode(self):
2463 decode = codecs.unicode_escape_decode
2464 check = coding_checker(self, decode)
2465 check(b"[\\\n]", "[]")
2466 check(br'[\"]', '["]')
2467 check(br"[\']", "[']")
2468 check(br"[\\]", r"[\]")
2469 check(br"[\a]", "[\x07]")
2470 check(br"[\b]", "[\x08]")
2471 check(br"[\t]", "[\x09]")
2472 check(br"[\n]", "[\x0a]")
2473 check(br"[\v]", "[\x0b]")
2474 check(br"[\f]", "[\x0c]")
2475 check(br"[\r]", "[\x0d]")
2476 check(br"[\7]", "[\x07]")
Serhiy Storchakad6793772013-01-29 10:20:44 +02002477 check(br"[\78]", "[\x078]")
2478 check(br"[\41]", "[!]")
2479 check(br"[\418]", "[!8]")
2480 check(br"[\101]", "[A]")
2481 check(br"[\1010]", "[A0]")
2482 check(br"[\x41]", "[A]")
2483 check(br"[\x410]", "[A0]")
2484 check(br"\u20ac", "\u20ac")
2485 check(br"\U0001d120", "\U0001d120")
R David Murray110b6fe2016-09-08 15:34:08 -04002486 for i in range(97, 123):
2487 b = bytes([i])
2488 if b not in b'abfnrtuvx':
2489 with self.assertWarns(DeprecationWarning):
2490 check(b"\\" + b, "\\" + chr(i))
2491 if b.upper() not in b'UN':
2492 with self.assertWarns(DeprecationWarning):
2493 check(b"\\" + b.upper(), "\\" + chr(i-32))
2494 with self.assertWarns(DeprecationWarning):
2495 check(br"\8", "\\8")
2496 with self.assertWarns(DeprecationWarning):
2497 check(br"\9", "\\9")
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03002498 with self.assertWarns(DeprecationWarning):
2499 check(b"\\\xfa", "\\\xfa")
Serhiy Storchakad6793772013-01-29 10:20:44 +02002500
2501 def test_decode_errors(self):
2502 decode = codecs.unicode_escape_decode
2503 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2504 for i in range(d):
2505 self.assertRaises(UnicodeDecodeError, decode,
2506 b"\\" + c + b"0"*i)
2507 self.assertRaises(UnicodeDecodeError, decode,
2508 b"[\\" + c + b"0"*i + b"]")
2509 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2510 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2511 self.assertEqual(decode(data, "replace"),
2512 ("[\ufffd]\ufffd", len(data)))
2513 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2514 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2515 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2516
2517
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002518class RawUnicodeEscapeTest(unittest.TestCase):
2519 def test_empty(self):
2520 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2521 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2522
2523 def test_raw_encode(self):
2524 encode = codecs.raw_unicode_escape_encode
2525 for b in range(256):
2526 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2527
2528 def test_raw_decode(self):
2529 decode = codecs.raw_unicode_escape_decode
2530 for b in range(256):
2531 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2532
2533 def test_escape_encode(self):
2534 encode = codecs.raw_unicode_escape_encode
2535 check = coding_checker(self, encode)
2536 for b in range(256):
2537 if b not in b'uU':
2538 check('\\' + chr(b), b'\\' + bytes([b]))
2539 check('\u20ac', br'\u20ac')
2540 check('\U0001d120', br'\U0001d120')
2541
2542 def test_escape_decode(self):
2543 decode = codecs.raw_unicode_escape_decode
2544 check = coding_checker(self, decode)
2545 for b in range(256):
2546 if b not in b'uU':
2547 check(b'\\' + bytes([b]), '\\' + chr(b))
2548 check(br"\u20ac", "\u20ac")
2549 check(br"\U0001d120", "\U0001d120")
2550
2551 def test_decode_errors(self):
2552 decode = codecs.raw_unicode_escape_decode
2553 for c, d in (b'u', 4), (b'U', 4):
2554 for i in range(d):
2555 self.assertRaises(UnicodeDecodeError, decode,
2556 b"\\" + c + b"0"*i)
2557 self.assertRaises(UnicodeDecodeError, decode,
2558 b"[\\" + c + b"0"*i + b"]")
2559 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2560 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2561 self.assertEqual(decode(data, "replace"),
2562 ("[\ufffd]\ufffd", len(data)))
2563 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2564 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2565 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2566
2567
class EscapeEncodeTest(unittest.TestCase):
    """Check codecs.escape_encode() output and its rejection of non-bytes."""

    def test_escape_encode(self):
        escape = codecs.escape_encode
        # input -> escaped form; in every case below the second element of
        # the result equals len(input).
        escaped_forms = {
            b'': b'',
            b'foobar': b'foobar',
            b'spam\0eggs': b'spam\\x00eggs',
            b'a\'b': b"a\\'b",
            b'b\\c': b'b\\\\c',
            b'c\nd': b'c\\nd',
            b'd\re': b'd\\re',
            b'f\x7fg': b'f\\x7fg',
        }
        for data, escaped in escaped_forms.items():
            with self.subTest(data=data):
                self.assertEqual(escape(data), (escaped, len(data)))
        # str and bytearray arguments are rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                escape(rejected)
2586
2587
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2620
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002621
class BomTest(unittest.TestCase):
    """Check how files opened with codecs.open() behave around seek()."""

    def test_seek0(self):
        # Each scenario below reopens support.TESTFN in 'w+' mode, writes
        # some text, seeks, and reads the whole file back.  The text read
        # back must be exactly what was written: a BOM written twice, or
        # missing after seek(0), would corrupt the decoded result.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Report every encoding separately, so that a failure names
            # the codec involved and does not hide the remaining ones.
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a
                # position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a
                # seek() at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002677
Victor Stinner3fed0872010-05-22 02:16:27 +00002678
# Names of the bytes-to-bytes transform codecs exercised by
# TransformCodecTest.  Codecs backed by optional modules are appended below.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alternative names under which each transform codec must be reachable.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; the name is kept (as None) so tests can skip on it.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional as well; without it the codec is simply not tested.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002708
Victor Stinnerf96418d2015-09-21 23:06:27 +02002709
class TransformCodecTest(unittest.TestCase):
    """Tests for the transform codecs (bytes-to-bytes ones and rot_13)."""

    def test_basics(self):
        # Round-trip every byte value through the generic codecs interface
        # and check the reported input sizes.
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded, consumed = codecs.getencoder(encoding)(payload)
                self.assertEqual(consumed, len(payload))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, payload)

    def test_read(self):
        # A stream reader must decode the encoded form back via read()
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                reader_factory = codecs.getreader(encoding)
                self.assertEqual(reader_factory(stream).read(), b"\x80")

    def test_readline(self):
        # Same as test_read(), but going through readline()
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                reader_factory = codecs.getreader(encoding)
                self.assertEqual(reader_factory(stream).readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(original, encoding)
                encoded_from_view = codecs.encode(memoryview(original),
                                                  encoding)
                self.assertEqual(encoded_from_view, encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  encoding)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        # The wrapper must chain to an original error of the same type
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        # The wrapper must chain to an original error of the same type
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name, canonical)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        self.assertEqual(codecs.encode(b"space tab\teol \n", "quopri-codec"),
                         b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        decoded = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(decoded, unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2841
Nick Coghlan8b097b42013-11-13 23:49:21 +10002842
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    """Codec search function: look codec_name up in _TEST_CODECS.

    Return the registered entry, or None when there is none.
    """
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Fallback used when _codecs provides no _forget_codec: do nothing."""
2863
2864
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec exceptions get wrapped with codec/operation details."""

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        raw_name = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        # Publish the encode/decode pair under this test's unique codec name
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # The body must raise exc_type with the wrapper message, chained
        # (via __cause__) to an original exc_type that has a traceback.
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        # Both the text model methods and the codecs functions must wrap
        attempts = (
            ("encoding", lambda: "str_input".encode(self.codec_name)),
            ("encoding", lambda: codecs.encode("str_input", self.codec_name)),
            ("decoding", lambda: b"bytes input".decode(self.codec_name)),
            ("decoding",
             lambda: codecs.decode(b"bytes input", self.codec_name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        # The original exception must come through with its own message
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, self.codec_name), "not bytes!")
        self.assertEqual(codecs.decode(None, self.codec_name), b"not str!")
        # Text model methods should complain
        encode_pattern = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, encode_pattern):
            "str_input".encode(self.codec_name)
        decode_pattern = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, decode_pattern):
            b"bytes input".decode(self.codec_name)
3024
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003025
Georg Brandl02524622010-12-02 18:06:51 +00003026
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must mention the code page involved
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests is an iterable of (raw, errors, expected) triples; an
        # expected value of None means that decoding must fail with
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        # tests is an iterable of (text, errors, expected) triples; an
        # expected value of None means that encoding must fail with
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_incremental(self):
        # The last argument (final) is False: a truncated trailing
        # multibyte sequence is left unconsumed instead of being an error.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        # bigmemtest passes the size to use as an extra argument, so the
        # method has to accept it; the input is derived from that value.
        encoded = (b'01234567' * ((size // 8) - 1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size + 2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Release the input early to lower the peak memory usage
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3209
Victor Stinner3a50e702011-10-18 21:21:00 +02003210
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                actual = text.encode('ascii', handler)
                self.assertEqual(actual, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                actual = raw.decode('ascii', handler)
                self.assertEqual(actual, wanted)
3248
3249
class Latin1Test(unittest.TestCase):
    """Check the 'latin1' codec: plain conversions and encode error handlers."""

    def test_encode(self):
        """Characters up to U+00FF encode to the byte with the same value."""
        cases = [
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ]
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                result = data.encode('latin1')
                self.assertEqual(result, expected)

    def test_encode_errors(self):
        """Each error handler produces its own output for unencodable text."""
        unencodable = '[\u20ac\udc80]'
        cases = [
            (unencodable, 'ignore', b'[]'),
            (unencodable, 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            (unencodable, 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ]
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                result = data.encode('latin1', error_handler)
                self.assertEqual(result, expected)

    def test_encode_surrogateescape_error(self):
        """surrogateescape must still fail on a non-surrogate character."""
        # The first character (a lone surrogate) can be encoded by the
        # handler, but the second one ('\u20ac') cannot.
        text = '\udc80\u20ac'
        self.assertRaises(UnicodeEncodeError,
                          text.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        """Every byte decodes to the character with the same code point."""
        cases = [
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ]
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                result = data.decode('latin1')
                self.assertEqual(result, expected)
3285
3286
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    The C helpers exposed by _testcapi are compared against the pure
    Python str.encode() / bytes.decode() results for the filesystem
    encoding.
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* through _testcapi.EncodeLocaleEx()."""
        # NOTE(review): the literal 0 is presumably the "current locale"
        # flag of the C helper -- confirm against _testcapi.
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Compare self.encode() with str.encode() for every STRINGS item.

        When str.encode() raises UnicodeEncodeError, self.encode() is
        expected to raise RuntimeError with an "encode error" message.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    wanted = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as ctx:
                        self.encode(text, errors)
                    self.assertRegex(str(ctx.exception),
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    self.assertEqual(self.encode(text, errors), wanted)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: skip when the encoder reports
        # that it does not know the handler, re-raise any other ValueError.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                          f"surrogatepass error handler")

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as ctx:
            self.encode('', 'backslashreplace')
        message = str(ctx.exception)
        self.assertEqual(message, 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* through _testcapi.DecodeLocaleEx()."""
        # NOTE(review): the literal 0 is presumably the "current locale"
        # flag of the C helper -- confirm against _testcapi.
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Compare self.decode() with bytes.decode() on a set of inputs.

        The inputs are BYTES_STRINGS plus every encodable STRINGS item
        (and, for UTF-8, its surrogatepass encoding when that differs).
        When bytes.decode() raises UnicodeDecodeError, self.decode() is
        expected to raise RuntimeError with a "decode error: " message.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        encode_errors = 'surrogateescape' if is_utf8 else 'strict'

        candidates = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                encoded = None
            else:
                if encoded not in candidates:
                    candidates.append(encoded)

            if is_utf8:
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    candidates.append(encoded2)

        for encoded in candidates:
            with self.subTest(encoded=encoded):
                try:
                    wanted = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as ctx:
                        self.decode(encoded, errors)
                    errmsg = str(ctx.exception)
                    self.assertTrue(errmsg.startswith("decode error: "),
                                    errmsg)
                else:
                    self.assertEqual(self.decode(encoded, errors), wanted)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty bytes first: skip when the decoder reports that
        # it does not know the handler, re-raise any other ValueError.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                          f"surrogatepass error handler")

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as ctx:
            self.decode(b'', 'backslashreplace')
        message = str(ctx.exception)
        self.assertEqual(message, 'unsupported error handler')
3399
Victor Stinner3d4226a2018-08-29 22:21:32 +02003400
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()