blob: 328a47b2e376693d80d24d813abf9f282d3ab3b5 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Hai Shi46605972020-08-04 00:49:18 +080011from test.support import os_helper
12from test.support import warnings_helper
Victor Stinner182d90d2011-09-29 19:53:55 +020013
Antoine Pitrou00b2c862011-10-05 13:01:41 +020014try:
Victor Stinner3d4226a2018-08-29 22:21:32 +020015 import _testcapi
Pablo Galindo293dd232019-11-19 21:34:03 +000016except ImportError:
Victor Stinner3d4226a2018-08-29 22:21:32 +020017 _testcapi = None
18
19try:
Antoine Pitrou00b2c862011-10-05 13:01:41 +020020 import ctypes
21except ImportError:
22 ctypes = None
23 SIZEOF_WCHAR_T = -1
24else:
25 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000026
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    The returned callable runs ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result equals ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        consumed = len(input)
        self.assertEqual(actual, (expect, consumed))

    return check
31
# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Ask kernel32 ``GetCPInfoExW`` about code page *cp*.

    Returns the BOOL result of the call unchanged, so callers should use it
    as a truth value.  The ctypes imports below only succeed on Windows.
    """
    # Everything needed from ctypes is imported locally, so this function
    # does not depend on the module-level ``ctypes`` name (which is set to
    # None when the top-of-file import fails).
    from ctypes import POINTER, WINFUNCTYPE, Structure, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    # The structure is only an output buffer here; its contents are not read.
    info = CPINFOEXW()
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020052
class Queue(object):
    """
    Minimal FIFO buffer: write() appends at one end, read() consumes
    from the other end.
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer type (e.g. b"" or "").
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        pending = self._buffer
        if size < 0:
            # Drain everything; slicing keeps an empty buffer of the same type.
            self._buffer = pending[:0]
            return pending
        # Hand out at most *size* items and keep the remainder queued.
        self._buffer = pending[size:]
        return pending[:size]
72
Victor Stinnerf96418d2015-09-21 23:06:27 +020073
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for incremental codecs.

    The host class must provide the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every offset and check that decoding resumes to *u*."""
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            head, tail = s[:split], s[split:]
            first = new_decoder()
            decoded_head = first.decode(head)
            saved = first.getstate()
            self.assertIsInstance(saved[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not saved[1]:
                # put the decoder back into the default state, nothing buffered
                first.setstate((saved[0][:0], 0))
                # Re-feeding the buffered input must not produce any output
                self.assertTrue(not first.decode(saved[0]))
                # ... and must bring the decoder back to the saved state
                self.assertEqual(saved, first.getstate())
            # A brand new decoder primed with the saved state has to be able
            # to finish the job
            second = new_decoder()
            second.setstate(saved)
            decoded_tail = second.decode(tail, True)
            self.assertEqual(u, decoded_head + decoded_tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every offset and check that encoding resumes to *s*."""
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            first = new_encoder()
            encoded_head = first.encode(u[:split])
            saved = first.getstate()
            # Continue with a fresh encoder that only knows the saved state
            second = new_encoder()
            second.setstate(saved)
            encoded_tail = second.encode(u[split:], True)
            self.assertEqual(s, encoded_head + encoded_tail)
106
Victor Stinnerf96418d2015-09-21 23:06:27 +0200107
class ReadTest(MixInCheckStateHandling):
    # Reader/decoder tests shared by several encodings.  The methods read
    # self.encoding and self.ill_formed_sequence, neither of which is defined
    # here; the concrete test classes that inherit from this one set them.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # Build a StreamReader over the encoded form of *input*.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Call readline() until it returns an empty string and join the
        # lines with "|" so that the split points show up in the result.
        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        # Every scenario below starts from a fresh reader over the same data.
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # The fixture is a list of "\r\n"-terminated lines; iterating a
        # StreamReader over their concatenation must give them back one by one.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # NOTE(review): only the lines the reader actually yields are compared;
        # a reader that stopped early would not make this loop fail.
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so text written through the
        # writer becomes available to the reader's next readline() call.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n"-terminated lines must come back unchanged from
        # readline(), followed by "" once the stream is exhausted.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text that test_lone_surrogates expects in place of
    # self.ill_formed_sequence when decoding with the "replace" handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding side: strict mode rejects a lone surrogate, the other
        # error handlers substitute it as asserted below.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for the empty string is treated as its
        # BOM and stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            # Expected "backslashreplace" output: one \xNN escape per byte of
            # the ill-formed sequence.
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        # (unlike the high surrogate case, the second chunk is decoded
        # without the final flag)
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +0200446
Victor Stinnerf96418d2015-09-21 23:06:27 +0200447
class UTF32Test(ReadTest, unittest.TestCase):
    # Tests for the BOM-carrying "utf-32" codec.
    encoding = "utf-32"
    # U+DC80 as a single 4-byte unit, laid out according to sys.byteorder
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" preceded by one BOM, little-endian and big-endian
    spamle = b'\xff\xfe\x00\x00' + 2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
    spambe = b'\x00\x00\xfe\xff' + 2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'

    def test_only_one_bom(self):
        reader, writer = codecs.lookup(self.encoding)[2:]
        # encode some stream with two separate writes
        raw = io.BytesIO()
        out = writer(raw)
        for _ in range(2):
            out.write("spam")
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(reader(io.BytesIO(encoded)).read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two all-0xff units form a valid BOM
        for count in (4, 8):
            stream = io.BytesIO(count * b"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Nothing is produced for the four BOM bytes (the byte order is only
        # known after the fourth) nor for the first three bytes of the first
        # character.
        expected = [""] * 7
        # Every further prefix is then seen for four consecutive bytes ...
        for end in range(1, 5):
            expected += [text[:end]] * 4
        # ... and the very last byte completes the whole text.
        expected.append(text)
        self.check_partial(text, expected)

    def test_handlers(self):
        truncated = b'\x01'
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(truncated, handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        samples = (
            b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024,  # little-endian
            b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024,  # big-endian
        )
        for encoded in samples:
            self.assertEqual(expected, codecs.utf_32_decode(encoded)[0])
542
Victor Stinnerf96418d2015-09-21 23:06:27 +0200543
class UTF32LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-le" codec.
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first three bytes of the first character produce nothing.
        expected = [""] * 3
        # Every further prefix is then seen for four consecutive bytes ...
        for end in range(1, 5):
            expected += [text[:end]] * 4
        # ... and the very last byte completes the whole text.
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
588
Victor Stinnerf96418d2015-09-21 23:06:27 +0200589
class UTF32BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-be" codec.
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first three bytes of the first character produce nothing.
        expected = [""] * 3
        # Every further prefix is then seen for four consecutive bytes ...
        for end in range(1, 5):
            expected += [text[:end]] * 4
        # ... and the very last byte completes the whole text.
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
634
635
class UTF16Test(ReadTest, unittest.TestCase):
    # Exercises the BOM-aware "utf-16" codec.
    encoding = "utf-16"
    # The lone low surrogate U+DC80, laid out in the platform's byte order.
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" preceded by a little-endian / big-endian BOM respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through the same stream writer must yield a single BOM.
        codec_info = codecs.lookup(self.encoding)
        raw = io.BytesIO()
        stream_writer = codec_info.streamwriter(raw)
        for _ in range(2):
            stream_writer.write("spam")
        encoded = raw.getvalue()
        # Either byte order is fine, as long as there is exactly one BOM.
        self.assertIn(encoded, (self.spamle, self.spambe))
        # The encoded bytes must read back through the matching reader.
        stream_reader = codec_info.streamreader(io.BytesIO(encoded))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # Neither payload starts with b'\xff\xfe' or b'\xfe\xff', so reading
        # has to fail.
        for payload in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream_reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
            self.assertRaises(UnicodeError, stream_reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        partial_results = [
            "",  # first byte of BOM read
            "",  # second byte of BOM read => byteorder known
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, partial_results)

    def test_handlers(self):
        # A single stray byte is consumed by both lenient error handlers.
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        # The same kind of input must raise under the strict handler.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        # Run the state-handling check against both byte orders.
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'
        payload = text.encode(self.encoding)

        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        with open(os_helper.TESTFN, 'wb') as binary_file:
            binary_file.write(payload)
        # Opening with the 'U' mode flag is expected to emit a
        # DeprecationWarning.
        with warnings_helper.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(os_helper.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000721
class UTF16LETest(ReadTest, unittest.TestCase):
    # Exercises the BOM-less little-endian "utf-16-le" codec.
    encoding = "utf-16-le"
    # The lone low surrogate U+DC80 in little-endian byte order.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result under 'replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing input and lets the remaining cases
            # run, as UTF7Test.test_errors already does.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
765
class UTF16BETest(ReadTest, unittest.TestCase):
    # Exercises the BOM-less big-endian "utf-16-be" codec.
    encoding = "utf-16-be"
    # The lone low surrogate U+DC80 in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result under 'replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing input and lets the remaining cases
            # run, as UTF7Test.test_errors already does.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
809
class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    # UTF-8 byte form of the lone surrogate U+DC80 (compare the
    # surrogatepass expectations below).
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Bytes expected in front of every encoder result; UTF8SigTest overrides
    # this with the real UTF-8 signature.
    BOM = b''

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        partial_results = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, partial_results)

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same invalid input, decoded under each lenient error handler.
        data = b'[\x80\xff]'
        expected_by_handler = (
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        )
        for error_handler, expected in expected_by_handler:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # Only U+DC80 can be escaped; the error must span the two
        # surrogates that follow it.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = caught.exception
        self.assertEqual(error.object[error.start:error.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        handler = "surrogatepass"

        # Encoding: surrogates are written out as three-byte sequences.
        encode_cases = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        )
        for text, payload in encode_cases:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + payload)

        # Decoding: the same sequences are turned back into surrogates.
        decode_cases = (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        )
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))
        # Truncated or interrupted surrogate sequences are still errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, handler)

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        prefixes = (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                    b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                    b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF')
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        cases += [prefix + suffix
                  for prefix in prefixes
                  for suffix in (b'\x7F', b'\xC0')]
        cases += [b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                  b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90']

        for data in cases:
            with self.subTest(data=data):
                decoder = codecs.getincrementaldecoder(self.encoding)()
                with self.assertRaises(UnicodeDecodeError):
                    decoder.decode(data)
902
Victor Stinnerf96418d2015-09-21 23:06:27 +0200903
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_ascii(self):
        def check_passthrough(chars):
            # Characters that UTF-7 represents as plain ASCII in both
            # directions.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(self.encoding), as_ascii)
            self.assertEqual(as_ascii.decode(self.encoding), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_passthrough(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_passthrough(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_passthrough(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_covered))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        partial_results = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, partial_results)

    def test_errors(self):
        # Each entry is (malformed input, expected result under 'replace').
        cases = (
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        )
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Each entry is (text, its UTF-7 encoding including the closing '-').
        round_trips = (
            ('\U000104A0', b'+2AHcoA-'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode(self.encoding), encoded)
            self.assertEqual(encoded.decode(self.encoding), text)
            # The closing '-' may be missing at the end of the input.
            self.assertEqual(encoded[:-1].decode(self.encoding), text)
        # An explicit surrogate pair encodes like the character it denotes.
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')

    def test_lone_surrogates(self):
        # Each entry is (input, expected result under 'replace').
        cases = (
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        )
        for raw, expected in cases:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001036
1037
class UTF16ExTest(unittest.TestCase):
    # Error and argument handling of codecs.utf_16_ex_decode().

    def test_errors(self):
        # A one-byte input must be rejected under the strict handler.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1045
class ReadBufferTest(unittest.TestCase):
    # Checks codecs.readbuffer_encode() on buffers, text and bad arguments.

    def test_array(self):
        # An array of signed bytes comes back as bytes plus its length.
        from array import array
        encoded = codecs.readbuffer_encode(array("b", b"spam"))
        self.assertEqual(encoded, (b"spam", 4))

    def test_empty(self):
        # An empty str is accepted and yields empty bytes.
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are TypeErrors.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1061
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Re-runs the UTF-8 tests against "utf-8-sig" and adds BOM-specific ones.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder drops the signature written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sized_stream_reads(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig stream reader, once per size
        # hint, and check that the chunks join up to *unistring*.  A size
        # hint of None means a single unbounded read().
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None, *range(1, 11), 64, 128, 256, 512, 1024]:
            # subTest reports which size hint failed and keeps going.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature: the signature is
        # not part of the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_stream_reads(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature decodes to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_stream_reads(bytestring, unistring)
1146
Chris A2565ede2020-03-02 01:39:50 -05001147
Guido van Rossum87c0f1d2007-11-19 18:03:44 +00001148class EscapeDecodeTest(unittest.TestCase):
1149 def test_empty(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001150 self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Serhiy Storchaka8490f5a2015-03-20 09:00:36 +02001151 self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +00001152
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001153 def test_raw(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001154 decode = codecs.escape_decode
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001155 for b in range(256):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001156 b = bytes([b])
1157 if b != b'\\':
1158 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001159
1160 def test_escape(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001161 decode = codecs.escape_decode
1162 check = coding_checker(self, decode)
1163 check(b"[\\\n]", b"[]")
1164 check(br'[\"]', b'["]')
1165 check(br"[\']", b"[']")
R David Murray110b6fe2016-09-08 15:34:08 -04001166 check(br"[\\]", b"[\\]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001167 check(br"[\a]", b"[\x07]")
1168 check(br"[\b]", b"[\x08]")
1169 check(br"[\t]", b"[\x09]")
1170 check(br"[\n]", b"[\x0a]")
1171 check(br"[\v]", b"[\x0b]")
1172 check(br"[\f]", b"[\x0c]")
1173 check(br"[\r]", b"[\x0d]")
1174 check(br"[\7]", b"[\x07]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001175 check(br"[\78]", b"[\x078]")
1176 check(br"[\41]", b"[!]")
1177 check(br"[\418]", b"[!8]")
1178 check(br"[\101]", b"[A]")
1179 check(br"[\1010]", b"[A0]")
1180 check(br"[\501]", b"[A]")
1181 check(br"[\x41]", b"[A]")
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001182 check(br"[\x410]", b"[A0]")
R David Murray110b6fe2016-09-08 15:34:08 -04001183 for i in range(97, 123):
1184 b = bytes([i])
1185 if b not in b'abfnrtvx':
1186 with self.assertWarns(DeprecationWarning):
1187 check(b"\\" + b, b"\\" + b)
1188 with self.assertWarns(DeprecationWarning):
1189 check(b"\\" + b.upper(), b"\\" + b.upper())
1190 with self.assertWarns(DeprecationWarning):
1191 check(br"\8", b"\\8")
1192 with self.assertWarns(DeprecationWarning):
1193 check(br"\9", b"\\9")
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03001194 with self.assertWarns(DeprecationWarning):
1195 check(b"\\\xfa", b"\\\xfa")
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001196
1197 def test_errors(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001198 decode = codecs.escape_decode
1199 self.assertRaises(ValueError, decode, br"\x")
1200 self.assertRaises(ValueError, decode, br"[\x]")
1201 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1202 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1203 self.assertRaises(ValueError, decode, br"\x0")
1204 self.assertRaises(ValueError, decode, br"[\x0]")
1205 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
1206 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001207
Victor Stinnerf96418d2015-09-21 23:06:27 +02001208
# From RFC 3492
# Each entry is a 2-tuple: (Unicode text, its Punycode form as ASCII bytes).
# The table is consumed by PunycodeTest below.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example because a comma is missing between the text and its encoding).
# It only reports; it does not raise.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316
Victor Stinnerf96418d2015-09-21 23:06:27 +02001317
class PunycodeTest(unittest.TestCase):
    """Exercise the "punycode" codec against ``punycode_testcases``."""

    def test_encode(self):
        for text, reference in punycode_testcases:
            # Both sides are compared in lower case: some of the extended
            # reference encodings use upper case while our encoder emits
            # only lower case, and lowering just the reference would not
            # be enough because some input characters are upper case too.
            produced = text.encode("punycode").decode("ascii")
            wanted = reference.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for text, reference in punycode_testcases:
            self.assertEqual(text, reference.decode("punycode"))
            # Decode once more after an ASCII decode/encode round trip.
            roundtripped = reference.decode("ascii").encode("ascii")
            self.assertEqual(text, roundtripped.decode("punycode"))

    def test_decode_invalid(self):
        # An exception instance as the expected value means "must raise";
        # anything else is the exact text the decoder has to return.
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if not isinstance(expected, Exception):
                    self.assertEqual(puny.decode("punycode", errors),
                                     expected)
                else:
                    with self.assertRaises(UnicodeError):
                        puny.decode("punycode", errors)
1348
Victor Stinnerf96418d2015-09-21 23:06:27 +02001349
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# consumed by NameprepTest.test_nameprep:
#   * expected is None   -> nameprep() must reject the input (UnicodeError);
#   * input is None      -> the vector is skipped;
#   * the list position gives the section number (index 0 is test 3.1), so
#     skipped vectors keep a (None, None) placeholder to preserve numbering.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1502
1503
class NameprepTest(unittest.TestCase):
    """Check ``encodings.idna.nameprep`` against ``nameprep_tests``."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Run every vector in its own subtest, labeled with its
            # section number ("3.<n>").  A failing vector is then reported
            # as an ordinary test failure with its full traceback, both
            # the "must raise" and the "must equal" branches are labeled,
            # and the remaining vectors still run.
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001522
Victor Stinnerf96418d2015-09-21 23:06:27 +02001523
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Everything was consumed above, so a second read returns nothing.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed each name one byte at a time.  ASCII and non-ASCII names are
        # both covered with and without a trailing dot (the same four
        # shapes as test_builtin_decode).
        samples = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in samples:
            with self.subTest(encoded=encoded):
                chunks = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is held back until final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a
        # time.  Same four name shapes as test_builtin_encode.
        samples = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in samples:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # The unfinished last label stays buffered until final=True.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot every label is flushed before the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        # "strict" must simply work in both directions ...
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        # ... and every other handler must be refused.
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1609
Victor Stinnerf96418d2015-09-21 23:06:27 +02001610
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level API of ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_unregister(self):
        name = "nonexistent_codec_name"
        search_function = mock.Mock()
        codecs.register(search_function)
        # Never let the mock outlive this test, even when an assertion
        # below fails before the explicit unregister() call; unregistering
        # a function that is no longer registered does nothing.
        self.addCleanup(codecs.unregister, search_function)
        # The mock returns a Mock, which is not a valid codec description.
        self.assertRaises(TypeError, codecs.lookup, name)
        search_function.assert_called_with(name)
        search_function.reset_mock()

        codecs.unregister(search_function)
        self.assertRaises(LookupError, codecs.lookup, name)
        search_function.assert_not_called()

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        # Every advertised name must actually exist on the module.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        # 'w' comes first so the file exists before it is opened with 'r'.
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(os_helper.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        mock_open = mock.mock_open()
        with mock.patch('builtins.open', mock_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(os_helper.TESTFN, 'wt', 'invalid-encoding')

            # The (mocked) file opened before the codec lookup failed must
            # have been closed again.
            file().close.assert_called()
1738
Victor Stinnerf96418d2015-09-21 23:06:27 +02001739
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 ``StreamReader``."""

    def setUp(self):
        # Two lines of UTF-8 encoded text; the last one has no newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_lines = self.reader(self.stream).readlines()
        self.assertEqual(decoded_lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001749
Victor Stinnerf96418d2015-09-21 23:06:27 +02001750
class EncodedFileTest(unittest.TestCase):
    """Transcoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written end up as Latin-1 in the file.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001762
# Names of the str <-> bytes codecs that BasicUnicodeTest.test_basics loops
# over.  Kept in alphabetical order.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are only added when this build's codecs module exposes
# the matching encode function (presumably Windows-only -- confirm).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode; test_basics skips
# its stream reader/writer and incremental encoder/decoder checks for them.
broken_unicode_with_stateful = [
    "punycode",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001880
Victor Stinnerf96418d2015-09-21 23:06:27 +02001881
Walter Dörwald3abcb012007-04-16 22:10:50 +00001882class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001883 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001884 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001885 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001886 name = codecs.lookup(encoding).name
1887 if encoding.endswith("_codec"):
1888 name += "_codec"
1889 elif encoding == "latin_1":
1890 name = "latin_1"
1891 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001892
Inada Naoki6a16b182019-03-18 15:44:11 +09001893 (b, size) = codecs.getencoder(encoding)(s)
1894 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1895 (chars, size) = codecs.getdecoder(encoding)(b)
1896 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001897
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001898 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001899 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001900 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001901 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001902 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001903 for c in s:
1904 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001905 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001906 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001907 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001908 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001909 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001910 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001911 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001912 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001913 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001914 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001915
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001916 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001917 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001918 try:
1919 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001920 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001921 pass
1922 else:
1923 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001924 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001925 for c in s:
1926 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001927 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001928 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001929 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001930 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001931 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001932 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001933 self.assertEqual(decodedresult, s,
1934 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001935
1936 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001937 result = "".join(codecs.iterdecode(
1938 codecs.iterencode(s, encoding), encoding))
1939 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001940
1941 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001942 result = "".join(codecs.iterdecode(
1943 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001944 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001945
Victor Stinner554f3f02010-06-16 23:33:54 +00001946 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001947 # check incremental decoder/encoder with errors argument
1948 try:
1949 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001950 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001951 pass
1952 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001953 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001954 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001955 decodedresult = "".join(decoder.decode(bytes([c]))
1956 for c in encodedresult)
1957 self.assertEqual(decodedresult, s,
1958 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001959
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_basics_capi(self):
    # Round-trip a plain ASCII sample through the incremental encoder and
    # decoder objects that _testcapi fetches through the C codec API.
    #
    # _testcapi is an optional extension module: the import at the top of
    # this file falls back to None, so skip instead of failing with an
    # AttributeError on None when it is not available.
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API: feed one character, then one byte, at a time
            encodedresult = b""
            for c in s:
                encodedresult += cencoder.encode(c)
            encodedresult += cencoder.encode("", True)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding)
            decodedresult = ""
            for c in encodedresult:
                decodedresult += cdecoder.decode(bytes([c]))
            decodedresult += cdecoder.decode(b"", True)
            self.assertEqual(decodedresult, s,
                             "encoding=%r" % encoding)

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(cencoder.encode(c) for c in s)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decodedresult = "".join(cdecoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001997
def test_seek(self):
    # Seeking a StreamReader back to the start must reset the internal
    # codec state and buffers, so every full read returns the same text.

    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # "idna" is skipped because of SF bug #1163178 (FIXME); the
        # codecs in broken_unicode_with_stateful are skipped as well.
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        raw = io.BytesIO(s.encode(encoding))
        reader = codecs.getreader(encoding)(raw)
        for _ in range(5):
            reader.seek(0, 0)
            self.assertEqual(s, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00002012
def test_bad_decode_args(self):
    # Every decoder must raise TypeError when called without arguments
    # and, except for the two codecs listed below, when given an int.
    skip_int_check = ("idna", "punycode")
    for encoding in all_unicode_encodings:
        decoder = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decoder()
        if encoding in skip_int_check:
            continue
        with self.assertRaises(TypeError):
            decoder(42)
2019
def test_bad_encode_args(self):
    # Every encoder must raise TypeError when called without arguments.
    for encoding in all_unicode_encodings:
        encoder = codecs.getencoder(encoding)
        with self.assertRaises(TypeError):
            encoder()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002024
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002025 def test_encoding_map_type_initialized(self):
2026 from encodings import cp1140
2027 # This used to crash, we are only verifying there's no crash.
2028 table_type = type(cp1140.encoding_table)
2029 self.assertEqual(table_type, table_type)
2030
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        encoded = u.encode(encoding)
        self.check_state_handling_decode(encoding, u, encoded)
        self.check_state_handling_encode(encoding, u, encoded)
2038
Victor Stinnerf96418d2015-09-21 23:06:27 +02002039
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with three kinds of mapping:
    # a str indexed by byte value, a dict of int -> str and a dict of
    # int -> int.  Each test decodes the bytes 0, 1, 2 and checks both
    # fully mapped input and input whose last byte has no usable mapping,
    # under every error handler.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Every byte is covered, so the output is the mapping string itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Byte 2 is undefined: past the end of the string, or mapped to
        # U+FFFE.
        holes = ("ab", "ab\ufffe")
        for table in holes:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)

        for errors, text in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for table in holes:
                self.assertEqual(decode(data, errors, table), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Mappings that define all three bytes (possibly as several
        # characters, a non-BMP character or the empty string).
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, text in complete:
            self.assertEqual(decode(data, "strict", mapping), (text, 3))

        # Byte 2 is undefined: missing key, None, or U+FFFE (Issue #14850).
        holes = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in holes:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, text in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for mapping in holes:
                self.assertEqual(decode(data, errors, mapping), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

        # Integer values outside the code point range are a TypeError.
        range_error = "character mapping must be in range\\(0x110000\\)"
        for bad_value in (-2, 999999999):
            with self.assertRaisesRegex(TypeError, range_error):
                decode(data, "strict", {0: "A", 1: 'Bb', 2: bad_value})

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = map(ord, "abc")

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(data, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # One past the largest code point is rejected outright.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is undefined: missing key, or mapped to 0xFFFE.
        holes = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in holes:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, text in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for mapping in holes:
                self.assertEqual(decode(data, errors, mapping), (text, 3))
2286
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002287
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers can be used as context managers; leaving
    # the "with" block must close the wrapped stream.

    def test_encodedfile(self):
        # The file holds UTF-8; EncodedFile hands it back as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as checked in test_encodedfile: exiting the
        # context manager closes the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002301
Victor Stinnerf96418d2015-09-21 23:06:27 +02002302
class TypesTest(unittest.TestCase):
    # Checks which argument types the low-level decoder functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only looked up when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            # subTest makes a failure report name the offending decoder.
            with self.subTest(decoder=decoder.__name__):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is one past the last valid code point.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002342
Serhiy Storchakad6793772013-01-29 10:20:44 +02002343
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point of
        # the same value.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by a character that does not start a known
        # escape is kept as is, with a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of hex digits it requires).  Every
        # shorter digit count must be reported as a truncated escape;
        # \U takes eight digits, so lengths 0..7 are all exercised.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U00110000 is one past the last valid code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2430
2431
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() /
    # raw_unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 0..255 are encoded as the single byte of that value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash is only special in front of "u" or "U".
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of hex digits it requires).  Every
        # shorter digit count must be reported as a truncated escape;
        # \U takes eight digits, so lengths 0..7 are all exercised.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U00110000 is one past the last valid code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2480
2481
class EscapeEncodeTest(unittest.TestCase):

    def test_escape_encode(self):
        # Maps each bytes input to the (escaped bytes, input length) pair
        # that codecs.escape_encode() must return for it.
        expected = {
            b'': (b'', 0),
            b'foobar': (b'foobar', 6),
            b'spam\0eggs': (b'spam\\x00eggs', 9),
            b'a\'b': (b"a\\'b", 3),
            b'b\\c': (b'b\\\\c', 3),
            b'c\nd': (b'c\\nd', 3),
            b'd\re': (b'd\\re', 3),
            b'f\x7fg': (b'f\\x7fg', 3),
        }
        for data, output in expected.items():
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)

        # Only bytes objects are accepted: neither str nor bytearray.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2500
2501
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" error handler an undecodable byte 0xNN
    # decodes to the lone surrogate U+DCNN, and encoding turns that
    # surrogate back into the original byte.

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2534
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002535
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write through codecs.open() with assorted seek patterns and read
        # the file back: the text must come back exactly as written.
        data = "1234567890"
        doubled = data * 2
        first_char = data[0]
        tests = ("utf-16", "utf-16-le", "utf-16-be",
                 "utf-32", "utf-32-le", "utf-32-be")
        path = os_helper.TESTFN
        self.addCleanup(os_helper.unlink, path)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(first_char)
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(first_char)
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002591
Victor Stinner3fed0872010-05-22 02:16:27 +00002592
# Names of the bytes-to-bytes transform codecs.  The compression codecs
# are added further down, only when the module backing them imports.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# Alternative names listed for each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so that code can test "zlib is None".
    zlib = None
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002622
Victor Stinnerf96418d2015-09-21 23:06:27 +02002623
Georg Brandl02524622010-12-02 18:06:51 +00002624class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002625
def test_basics(self):
    # Round-trip every byte value through each transform codec.
    binput = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002636
def test_read(self):
    # A StreamReader for a transform codec must decode on read().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002644
2645 def test_readline(self):
2646 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002647 with self.subTest(encoding=encoding):
2648 sin = codecs.encode(b"\x80", encoding)
2649 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2650 sout = reader.readline()
2651 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002652
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002653 def test_buffer_api_usage(self):
2654 # We check all the transform codecs accept memoryview input
2655 # for encoding and decoding
2656 # and also that they roundtrip correctly
2657 original = b"12345\x80"
2658 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002659 with self.subTest(encoding=encoding):
2660 data = original
2661 view = memoryview(data)
2662 data = codecs.encode(data, encoding)
2663 view_encoded = codecs.encode(view, encoding)
2664 self.assertEqual(view_encoded, data)
2665 view = memoryview(data)
2666 data = codecs.decode(data, encoding)
2667 self.assertEqual(data, original)
2668 view_decoded = codecs.decode(view, encoding)
2669 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002670
Victor Stinner0ee0b292020-08-11 15:28:43 +02002671 def test_text_to_binary_denylists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002672 # Check binary -> binary codecs give a good error for str input
2673 bad_input = "bad input type"
2674 for encoding in bytes_transform_encodings:
2675 with self.subTest(encoding=encoding):
R David Murray44b548d2016-09-08 13:59:53 -04002676 fmt = (r"{!r} is not a text encoding; "
2677 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002678 msg = fmt.format(encoding)
2679 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002680 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002681 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002682
Victor Stinner0ee0b292020-08-11 15:28:43 +02002683 def test_text_to_binary_denylists_text_transforms(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002684 # Check str.encode gives a good error message for str -> str codecs
2685 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002686 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002687 with self.assertRaisesRegex(LookupError, msg):
2688 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002689
Victor Stinner0ee0b292020-08-11 15:28:43 +02002690 def test_binary_to_text_denylists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002691 # Check bytes.decode and bytearray.decode give a good error
2692 # message for binary -> binary codecs
2693 data = b"encode first to ensure we meet any format restrictions"
2694 for encoding in bytes_transform_encodings:
2695 with self.subTest(encoding=encoding):
2696 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002697 fmt = (r"{!r} is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002698 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002699 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002700 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002701 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002702 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002703 bytearray(encoded_data).decode(encoding)
2704
Victor Stinner0ee0b292020-08-11 15:28:43 +02002705 def test_binary_to_text_denylists_text_transforms(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002706 # Check str -> str codec gives a good error for binary input
2707 for bad_input in (b"immutable", bytearray(b"mutable")):
2708 with self.subTest(bad_input=bad_input):
2709 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002710 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002711 with self.assertRaisesRegex(LookupError, msg) as failure:
2712 bad_input.decode("rot_13")
2713 self.assertIsNone(failure.exception.__cause__)
2714
Zachary Wareefa2e042013-12-30 14:54:11 -06002715 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002716 def test_custom_zlib_error_is_wrapped(self):
2717 # Check zlib codec gives a good error for malformed input
2718 msg = "^decoding with 'zlib_codec' codec failed"
2719 with self.assertRaisesRegex(Exception, msg) as failure:
2720 codecs.decode(b"hello", "zlib_codec")
2721 self.assertIsInstance(failure.exception.__cause__,
2722 type(failure.exception))
2723
2724 def test_custom_hex_error_is_wrapped(self):
2725 # Check hex codec gives a good error for malformed input
2726 msg = "^decoding with 'hex_codec' codec failed"
2727 with self.assertRaisesRegex(Exception, msg) as failure:
2728 codecs.decode(b"hello", "hex_codec")
2729 self.assertIsInstance(failure.exception.__cause__,
2730 type(failure.exception))
2731
2732 # Unfortunately, the bz2 module throws OSError, which the codec
2733 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002734
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002735 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2736 def test_aliases(self):
2737 for codec_name, aliases in transform_aliases.items():
2738 expected_name = codecs.lookup(codec_name).name
2739 for alias in aliases:
2740 with self.subTest(alias=alias):
2741 info = codecs.lookup(alias)
2742 self.assertEqual(info.name, expected_name)
2743
Martin Panter06171bd2015-09-12 00:34:28 +00002744 def test_quopri_stateless(self):
2745 # Should encode with quotetabs=True
2746 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2747 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2748 # But should still support unescaped tabs and spaces
2749 unescaped = b"space tab eol\n"
2750 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2751
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002752 def test_uu_invalid(self):
2753 # Missing "begin" line
2754 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2755
Nick Coghlan8b097b42013-11-13 23:49:21 +10002756
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    """Codec search function backed by _TEST_CODECS.

    Return the entry stored under *codec_name*, or None when there is no
    such entry.
    """
    return _TEST_CODECS.get(codec_name)
Nick Coghlan8fad1672014-09-15 23:50:44 +12002769
2770
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised by a codec get wrapped with the
    operation and codec name, and which are passed through unchanged."""

    def setUp(self):
        self.codec_name = 'exception_chaining_test'
        codecs.register(_get_test_codec)
        self.addCleanup(codecs.unregister, _get_test_codec)

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)

    def set_codec(self, encode, decode):
        """Install *encode*/*decode* as the codec named self.codec_name."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Context manager asserting that the body raises a wrapped error.

        The raised exception must be an *exc_type* whose message names
        *operation*, the test codec, the exception type and *msg*, and
        whose __cause__ is an *exc_type* instance carrying a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check that *obj_to_raise* is wrapped on all four entry points
        (str.encode, codecs.encode, bytes.decode, codecs.decode)."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check that *obj_to_raise* reaches the caller as a RuntimeError
        whose message matches the regex *msg* on all four entry points."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2919
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002920
Georg Brandl02524622010-12-02 18:06:51 +00002921
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # Code page number used for UTF-8 throughout this class.
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The code page name must show up in the error message.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each case in *tests* with code page *cp*.

        *tests* yields (raw, errors, expected) triples.  When *expected*
        is None the decode must raise UnicodeDecodeError; otherwise the
        decoded text must equal *expected* and the reported consumed
        length must lie between 0 and len(raw).
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Encode each case in *tests* with code page *cp*.

        *tests* yields (text, errors, expected) triples.  When *expected*
        is None the encode must raise UnicodeEncodeError; otherwise the
        encoded bytes must equal *expected* and the reported consumed
        length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if is_code_page_present(cp):
                self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                                 ('abc', 3), f'cp{cp}')
            elif support.verbose:
                print(f" skipping cp={cp}")
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left
        # unconsumed rather than reported as an error.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before the remaining checks to lower peak memory.
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(encoded), size // 8 * 10)
        # Use the class constant rather than repeating the literal 65001.
        decoded = codecs.code_page_decode(self.CP_UTF8, encoded,
                                          'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before the remaining checks to lower peak memory.
        del encoded
        self.assertEqual(len(decoded[0]), size)
        self.assertEqual(decoded[0][:10], '0123456\ud10001')
        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
3136
Victor Stinner3a50e702011-10-18 21:21:00 +02003137
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        # Each case is (text, error handler name, expected bytes).
        for data, error_handler, expected in (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('ascii', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        # Each case is (bytes, error handler name, expected text).
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode('ascii', error_handler),
                                 expected)
3175
3176
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        # Each case is (text, error handler name, expected bytes).
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
3212
3213
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        bio = io.BytesIO()
        codec = codecs.lookup('ascii')
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        sr.writelines([b'a', b'b'])
        self.assertEqual(bio.getvalue(), b'ab')

    def test_write(self):
        bio = io.BytesIO()
        codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        sr.write(text.encode('latin1'))
        self.assertEqual(bio.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        # The underlying stream holds UTF-16-LE; reads come back as UTF-8.
        bio = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        self.assertEqual(sr.readline(), b'line1\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'line1\n')
        self.assertEqual(sr.readline(), b'line2\n')
        self.assertEqual(sr.readline(), b'line3\n')
        self.assertEqual(sr.readline(), b'')

    def test_seeking_write(self):
        bio = io.BytesIO('123456789\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        sr.seek(2)
        sr.write(b'\nabc\n')
        self.assertEqual(sr.readline(), b'789\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'1\n')
        self.assertEqual(sr.readline(), b'abc\n')
        self.assertEqual(sr.readline(), b'789\n')
3258
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003259
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Indirectly test _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    The C code is reached through _testcapi.EncodeLocaleEx() and
    _testcapi.DecodeLocaleEx(); results are compared with what the Python
    codec for the filesystem encoding produces for the same input.
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = (
        "ascii",
        "ulatin1:\xa7\xe9",
        "u255:\xff",
        "UCS:\xe9\u20ac\U0010ffff",
        "surrogates:\uDC80\uDCFF",
    )
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with _testcapi.EncodeLocaleEx() using *errors*."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Check encode() against str.encode() for every entry of STRINGS."""
        for text in self.STRINGS:
            with self.subTest(text=text):
                # None marks a string the Python codec refuses to encode.
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    expected = None

                if expected is None:
                    # The C encoder must fail too, reporting a position
                    # and a reason.
                    with self.assertRaises(RuntimeError) as caught:
                        self.encode(text, errors)
                    self.assertRegex(str(caught.exception),
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    self.assertEqual(self.encode(text, errors), expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: skip when the encoder reports
        # that it does not know the surrogatepass handler.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                          f"surrogatepass error handler")

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with _testcapi.DecodeLocaleEx() using *errors*."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Check decode() against bytes.decode() for a set of byte strings.

        The inputs are BYTES_STRINGS plus the encoded form of every entry
        of STRINGS that the filesystem encoding is able to encode.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        encode_errors = 'surrogateescape' if is_utf8 else 'strict'

        samples = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                encoded = None
            else:
                if encoded not in samples:
                    samples.append(encoded)

            if is_utf8:
                # Also decode the surrogatepass form when it differs.
                passed = text.encode(self.ENCODING, 'surrogatepass')
                if passed != encoded:
                    samples.append(passed)

        for encoded in samples:
            with self.subTest(encoded=encoded):
                # None marks bytes the Python codec refuses to decode.
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    expected = None

                if expected is None:
                    with self.assertRaises(RuntimeError) as caught:
                        self.decode(encoded, errors)
                    message = str(caught.exception)
                    self.assertTrue(message.startswith("decode error: "),
                                    message)
                else:
                    self.assertEqual(self.decode(encoded, errors), expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Same probing strategy as test_encode_surrogatepass().
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                          f"surrogatepass error handler")

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')
3372
Victor Stinner3d4226a2018-08-29 22:21:32 +02003373
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        # One-shot encoding through codecs.encode().
        self.assertEqual(
            codecs.encode("Caesar liked ciphers", 'rot-13'),
            'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        # One-shot decoding through codecs.decode().
        self.assertEqual(
            codecs.decode('Rg gh, Oehgr?', 'rot-13'),
            'Et tu, Brute?')

    def test_incremental_encode(self):
        encoder_class = codecs.getincrementalencoder('rot-13')
        encoded = encoder_class().encode('ABBA nag Cheryl Baker')
        self.assertEqual(encoded, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        decoder_class = codecs.getincrementaldecoder('rot-13')
        decoded = decoder_class().decode('terra Ares envy tha')
        self.assertEqual(decoded, 'green Nerf rail gun')
3393
3394
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        # Import the submodule explicitly.  ``rot_13`` only becomes an
        # attribute of the ``encodings`` package once the submodule has been
        # imported (for example as a side effect of an earlier codec lookup),
        # so reading ``encodings.rot_13`` directly makes this test fail with
        # AttributeError when it is run on its own.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)
        # getvalue() returns the whole buffer regardless of the stream
        # position, so no seek(0) is needed before reading it back.
        self.assertEqual(
            outfile.getvalue(),
            'To be, or not to be, that is the question')
3409
3410
Hai Shi3f342372020-10-09 03:20:57 +08003411class CodecNameNormalizationTest(unittest.TestCase):
3412 """Test codec name normalization"""
Hai Shic5b049b2020-10-14 23:43:31 +08003413 def test_codecs_lookup(self):
Hai Shi3f342372020-10-09 03:20:57 +08003414 FOUND = (1, 2, 3, 4)
3415 NOT_FOUND = (None, None, None, None)
3416 def search_function(encoding):
3417 if encoding == "aaa_8":
3418 return FOUND
3419 else:
3420 return NOT_FOUND
3421
3422 codecs.register(search_function)
3423 self.addCleanup(codecs.unregister, search_function)
3424 self.assertEqual(FOUND, codecs.lookup('aaa_8'))
3425 self.assertEqual(FOUND, codecs.lookup('AAA-8'))
3426 self.assertEqual(FOUND, codecs.lookup('AAA---8'))
3427 self.assertEqual(FOUND, codecs.lookup('AAA 8'))
3428 self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
3429 self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
3430 self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
3431 self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
3432 self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
3433 self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
3434
Hai Shic5b049b2020-10-14 23:43:31 +08003435 def test_encodings_normalize_encoding(self):
3436 # encodings.normalize_encoding() ignores non-ASCII characters.
3437 normalize = encodings.normalize_encoding
3438 self.assertEqual(normalize('utf_8'), 'utf_8')
3439 self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
3440 self.assertEqual(normalize('utf 8'), 'utf_8')
3441 # encodings.normalize_encoding() doesn't convert
3442 # characters to lower case.
3443 self.assertEqual(normalize('UTF 8'), 'UTF_8')
3444 self.assertEqual(normalize('utf.8'), 'utf.8')
3445 self.assertEqual(normalize('utf...8'), 'utf...8')
3446
Hai Shi3f342372020-10-09 03:20:57 +08003447
if __name__ == "__main__":
    # Run the tests of this module when the file is executed as a script.
    unittest.main()