blob: 8d112a171d7c4ef5f2138ff398d2b5270b3cccee [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
Antoine Pitrou00b2c862011-10-05 13:01:41 +020012try:
Victor Stinner3d4226a2018-08-29 22:21:32 +020013 import _testcapi
Pablo Galindo293dd232019-11-19 21:34:03 +000014except ImportError:
Victor Stinner3d4226a2018-08-29 22:21:32 +020015 _testcapi = None
16
17try:
Antoine Pitrou00b2c862011-10-05 13:01:41 +020018 import ctypes
19except ImportError:
20 ctypes = None
21 SIZEOF_WCHAR_T = -1
22else:
23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return an assertion helper bound to *coder*.

    Args:
        self: Object providing ``assertEqual`` (normally a test case).
        coder: Callable taking one input and returning a
            ``(result, consumed_length)`` pair.

    Returns:
        A function ``check(input, expect)`` asserting that ``coder(input)``
        equals ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Report whether code page *cp* is available (Windows only).

    Calls ``GetCPInfoExW`` from kernel32 through ctypes and returns its BOOL
    result unchanged (non-zero when the call succeeded).

    Args:
        cp: Numeric code page identifier passed to ``GetCPInfoExW``.
    """
    # Everything ctypes-related is imported locally so the helper does not
    # depend on the module-level ``ctypes`` name, which is None when the
    # import at the top of the file failed.
    from ctypes import POINTER, WINFUNCTYPE, Structure, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020050
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # ``buffer`` is the initial content; any sliceable type supporting
        # ``+=`` works (the tests in this file pass ``b""``).
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* (the default) drains the whole queue; otherwise at
        most *size* items are returned and the rest stays queued.
        """
        pending = self._buffer
        if size < 0:
            self._buffer = pending[:0]  # make empty, keeping the buffer type
            return pending
        self._buffer = pending[size:]
        return pending[:size]
70
Victor Stinnerf96418d2015-09-21 23:06:27 +020071
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for codecs.

    The host class must supply the ``assert*`` methods used below (as
    ``unittest.TestCase`` does).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder state transfer at every split point of *s*.

        For each prefix of the encoded input *s*, the prefix is decoded, the
        decoder state is copied into a fresh decoder, the remainder is decoded
        there, and the concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder state transfer at every split point of *u*.

        Mirrors ``check_state_handling_decode``: the text *u* is encoded in
        two parts by two encoders sharing state, and the result must equal
        the bytes *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
104
Victor Stinnerf96418d2015-09-21 23:06:27 +0200105
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-codec test classes.

    Concrete subclasses provide ``encoding`` (the codec name) and
    ``ill_formed_sequence`` (the bytes used by ``test_lone_surrogates``), and
    also derive from ``unittest.TestCase`` for the ``assert*`` methods.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare the
        accumulated decoded text with the matching entry of *partialresults*
        after every byte.
        """
        # Encode once; every pass below walks the same byte sequence.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """Exercise StreamReader.readline() with every supported line end."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        """Mix readline(), read(), read(n) and readlines() on one reader."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        """Iterate a reader over many CRLF-terminated lines and compare each
        line with the corresponding source line.
        """
        # NOTE(review): runs of spaces inside these literals look collapsed
        # (the embedded code has a single leading space at every nesting
        # level); compare with the upstream file if exact widths matter.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """Interleave writes and readline() calls on a shared Queue."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return three CRLF lines intact, then ''."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """readline() must return five CRLF lines intact, then ''."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected from the "replace" handler for ``ill_formed_sequence``.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        """Check encoding and decoding of lone surrogates under each error
        handler.
        """
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for an empty string (a BOM, if any) is
        # stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate (the second decode() call passes no final flag,
        # unlike the high-surrogate loop above)
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +0200444
Victor Stinnerf96418d2015-09-21 23:06:27 +0200445
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"
    # Encoded form of U+DC80 in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" with a single leading BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two consecutive writes must produce exactly one BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: 4 BOM bytes, then 4 bytes per character."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """Truncated final input with the 'replace' and 'ignore' handlers."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """State round-trips must work for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
Victor Stinnerf96418d2015-09-21 23:06:27 +0200541
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"
    # Little-endian encoded form of U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears after its 4th byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A single code point encodes least-significant byte first."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
586
Victor Stinnerf96418d2015-09-21 23:06:27 +0200587
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"
    # Big-endian encoded form of U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears after its 4th byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A single code point encodes most-significant byte first."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
632
633
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (the variant that uses a BOM)."""

    encoding = "utf-16"
    # The single code unit 0xDC80 (a lone low surrogate), laid out in the
    # byte order reported by sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM: little-endian / big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through the same stream writer must produce a single
        # BOM, and the result must read back as the concatenated text.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the actual bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Streams that start with b"\xff\xff" must be rejected by the
        # stream reader.
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            with self.subTest(data=data):
                f = codecs.getreader(self.encoding)(io.BytesIO(data))
                self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): judging by the per-entry comments, the list
        # holds the text decoded so far after each additional input byte --
        # confirm against ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single trailing byte with final=True is handed to the error
        # handler: 'replace' yields U+FFFD, 'ignore' yields nothing; both
        # report one byte consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A single byte with final=True must raise under 'strict'.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # Run the inherited state-handling check on both byte orders.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # The code unit 0xDC80 (a lone low surrogate) in little-endian order.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): the list appears to hold the text decoded so
        # far after each additional input byte -- confirm against
        # ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # Strict decoding with final=True must raise for every input.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest identifies the failing input and keeps the loop going.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through its little-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"
    # The code unit 0xDC80 (a lone low surrogate) in big-endian order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): the list appears to hold the text decoded so
        # far after each additional input byte -- confirm against
        # ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # Strict decoding with final=True must raise for every input.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest identifies the failing input and keeps the loop going.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through its big-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec.

    The expected encoder output is written as ``self.BOM + ...`` and the
    codec name is always taken from ``self.encoding``, so a subclass can
    rerun these tests for a BOM-prefixed variant by overriding both.
    """

    encoding = "utf-8"
    # The three-byte UTF-8 form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    # One U+FFFD per byte of ill_formed_sequence.
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix the encoder is expected to emit; empty for plain UTF-8.
    BOM = b''

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): the list appears to hold the text decoded so
        # far after each additional input byte -- confirm against
        # ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Run the inherited state-handling check on a string whose code
        # points sit at the boundaries of each UTF-8 sequence length.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # The same invalid bytes decoded under each error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # U+DC80 is escapable, but the following two surrogates are not;
        # the error must cover exactly those two characters.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates are written out as three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same byte sequences come back as surrogates.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted surrogate sequences must still fail.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        # Every valid-looking prefix followed by a byte that cannot
        # continue it.
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)


class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        # Checks how each group of ASCII characters is encoded/decoded.
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): the list appears to hold the text decoded so
        # far after each additional input byte -- confirm against
        # ReadTest.check_partial.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # Strict decoding with final=True must raise for every input.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # U+104A0 and its explicit surrogate pair encode to the same bytes;
        # decoding works with or without the terminating '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Each entry is (input, expected result with 'replace').  This
        # replaces the inherited test of the same name.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)


class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte with final=True must raise under 'strict'.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # A buffer-providing object is returned as bytes together with
        # its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str yields empty bytes and a length of zero.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument or a non-buffer argument is a TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec.

    Inherits every UTF8Test test; overriding ``encoding`` and ``BOM`` makes
    the inherited encoder assertions expect a leading UTF-8 BOM.
    """

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # check_partial() is inherited and not visible in this part of the
        # file.  NOTE(review): per the entry comments below, the list holds
        # the text decoded so far after each additional input byte --
        # confirm against ReadTest.check_partial.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the BOM the encoder added.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader in chunks of
        # several sizes (None means one unbounded read() per iteration) and
        # check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty result marks the end of the stream.
                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)


class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # Empty bytes and an empty bytearray both decode to (b"", 0).
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte other than the backslash is passed through unchanged.
        decode = codecs.escape_decode
        for i in range(256):
            b = bytes([i])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        # coding_checker() is defined earlier in the file; the returned
        # check(input, expect) asserts decode(input) == (expect, len(input)).
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Backslash followed by an ASCII letter (either case) that is not
        # one of the escapes above: a DeprecationWarning is expected and
        # the two bytes are kept unchanged.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Incomplete \x escapes: ValueError by default, dropped with
        # "ignore", replaced by b"?" with "replace".
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))


# From RFC 3492
# Each entry is a (unicode string, punycode-encoded bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a 2-tuple (for
# example because a comma is missing between adjacent literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))


class PunycodeTest(unittest.TestCase):
    """Tests for the "punycode" codec, driven by punycode_testcases."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            with self.subTest(puny=puny):
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again after an ASCII round-trip of the input.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))

    def test_decode_invalid(self):
        # Each entry is (input, error handler, expected).  An exception
        # instance as `expected` means decoding must raise UnicodeError.
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors), expected)


# Nameprep test vectors.
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes.  An expected
# value of None means the input must be rejected; (None, None) marks a
# vector that is skipped.  Entry N (1-based) corresponds to section 3.N.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the heading is kept as inherited, but the bytes \xef\xb9\xb6
    # below are the UTF-8 encoding of U+FE76, not U+FB38.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1500
1501
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() against the nameprep_tests vectors."""

    def test_nameprep(self):
        """Check every vector, reporting each one under its section number.

        Entries whose input is None are skipped.  An expected value of None
        means nameprep() must reject the input with UnicodeError; otherwise
        the prepared string must equal the expected one.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            # subTest labels every failure (including the "must raise"
            # branch) with its section number, keeps genuine assertion
            # failures reported as failures, and lets the remaining vectors
            # run after one of them fails.
            with self.subTest(test="3.%d" % pos):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001520
Victor Stinnerf96418d2015-09-21 23:06:27 +02001521
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """One-shot decoding, with and without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """One-shot encoding, with and without a trailing dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream returns an empty string."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time decoding matches the one-shot results."""
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII label without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is held back until its terminating dot (or the final call)
        # has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time encoding matches the one-shot results."""
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII label without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is held back until its terminating dot (or the final call)
        # has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1607
Victor Stinnerf96418d2015-09-21 23:06:27 +02001608
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level functions of the codecs module."""

    def test_decode(self):
        """codecs.decode(): positional, default-encoding and keyword use."""
        raw = b'\xe4\xf6\xfc'
        text = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(raw, 'latin-1'), text)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=raw, encoding='latin-1'), text)
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        """codecs.encode(): positional, default-encoding and keyword use."""
        text = '\xe4\xf6\xfc'
        raw = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(text, 'latin-1'), raw)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj=text, encoding='latin-1'), raw)
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects missing, unknown and blank names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_locale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        """codecs.__all__ lists exactly the expected names, all resolvable."""
        expected_names = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_names, codecs.__all__)
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        """codecs.open() returns a StreamReaderWriter for every mode tried."""
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec raises UnicodeError for any input/handler."""
        for text, raw in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(raw, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        """The opened file is closed when the encoding lookup fails."""
        fake_open = mock.mock_open()
        with mock.patch('builtins.open', fake_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(support.TESTFN, 'wt', 'invalid-encoding')

            file().close.assert_called()
1724
Victor Stinnerf96418d2015-09-21 23:06:27 +02001725
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # Two Hangul syllables separated by a newline, encoded as UTF-8.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits on the newline and keeps it on the first line."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001735
Victor Stinnerf96418d2015-09-21 23:06:27 +02001736
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 file content comes back as UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written are stored in the file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001748
# Names of the text encodings exercised by the generic codec tests.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are added (in that order) only when the codecs module
# exposes the matching *_encode function.
all_unicode_encodings.extend(
    optional for optional in ("mbcs", "oem")
    if hasattr(codecs, optional + "_encode")
)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001857
# The "undefined" encoding is deliberately left out of the list of tested
# encodings: it is not supposed to work.

# Encodings that don't work in stateful mode.
broken_unicode_with_stateful = ["punycode"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001866
Victor Stinnerf96418d2015-09-21 23:06:27 +02001867
Walter Dörwald3abcb012007-04-16 22:10:50 +00001868class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001869 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001870 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001871 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001872 name = codecs.lookup(encoding).name
1873 if encoding.endswith("_codec"):
1874 name += "_codec"
1875 elif encoding == "latin_1":
1876 name = "latin_1"
1877 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001878
Inada Naoki6a16b182019-03-18 15:44:11 +09001879 (b, size) = codecs.getencoder(encoding)(s)
1880 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1881 (chars, size) = codecs.getdecoder(encoding)(b)
1882 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001883
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001884 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001885 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001886 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001887 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001888 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001889 for c in s:
1890 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001891 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001892 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001893 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001894 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001895 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001896 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001897 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001898 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001899 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001900 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001901
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001902 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001903 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001904 try:
1905 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001906 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001907 pass
1908 else:
1909 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001910 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001911 for c in s:
1912 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001913 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001914 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001915 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001916 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001917 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001918 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001919 self.assertEqual(decodedresult, s,
1920 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001921
1922 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001923 result = "".join(codecs.iterdecode(
1924 codecs.iterencode(s, encoding), encoding))
1925 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001926
1927 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001928 result = "".join(codecs.iterdecode(
1929 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001930 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001931
Victor Stinner554f3f02010-06-16 23:33:54 +00001932 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001933 # check incremental decoder/encoder with errors argument
1934 try:
1935 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001936 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001937 pass
1938 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001939 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001940 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001941 decodedresult = "".join(decoder.decode(bytes([c]))
1942 for c in encodedresult)
1943 self.assertEqual(decodedresult, s,
1944 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001945
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001946 @support.cpython_only
1947 def test_basics_capi(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001948 s = "abc123" # all codecs should be able to encode these
1949 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001950 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001951 # check incremental decoder/encoder (fetched via the C API)
1952 try:
Victor Stinner3d4226a2018-08-29 22:21:32 +02001953 cencoder = _testcapi.codec_incrementalencoder(encoding)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001954 except LookupError: # no IncrementalEncoder
1955 pass
1956 else:
1957 # check C API
1958 encodedresult = b""
1959 for c in s:
1960 encodedresult += cencoder.encode(c)
1961 encodedresult += cencoder.encode("", True)
Victor Stinner3d4226a2018-08-29 22:21:32 +02001962 cdecoder = _testcapi.codec_incrementaldecoder(encoding)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001963 decodedresult = ""
1964 for c in encodedresult:
1965 decodedresult += cdecoder.decode(bytes([c]))
1966 decodedresult += cdecoder.decode(b"", True)
1967 self.assertEqual(decodedresult, s,
1968 "encoding=%r" % encoding)
1969
1970 if encoding not in ("idna", "mbcs"):
1971 # check incremental decoder/encoder with errors argument
1972 try:
Victor Stinner3d4226a2018-08-29 22:21:32 +02001973 cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001974 except LookupError: # no IncrementalEncoder
1975 pass
1976 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001977 encodedresult = b"".join(cencoder.encode(c) for c in s)
Victor Stinner3d4226a2018-08-29 22:21:32 +02001978 cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001979 decodedresult = "".join(cdecoder.decode(bytes([c]))
1980 for c in encodedresult)
1981 self.assertEqual(decodedresult, s,
1982 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001983
Walter Dörwald729c31f2005-03-14 19:06:30 +00001984 def test_seek(self):
1985 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001986 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001987 for encoding in all_unicode_encodings:
1988 if encoding == "idna": # FIXME: See SF bug #1163178
1989 continue
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001990 if encoding in broken_unicode_with_stateful:
Walter Dörwald729c31f2005-03-14 19:06:30 +00001991 continue
Victor Stinner05010702011-05-27 16:50:40 +02001992 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001993 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001994 # Test that calling seek resets the internal codec state and buffers
1995 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001996 data = reader.read()
1997 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001998
Walter Dörwalde22d3392005-11-17 08:52:34 +00001999 def test_bad_decode_args(self):
2000 for encoding in all_unicode_encodings:
2001 decoder = codecs.getdecoder(encoding)
2002 self.assertRaises(TypeError, decoder)
2003 if encoding not in ("idna", "punycode"):
2004 self.assertRaises(TypeError, decoder, 42)
2005
2006 def test_bad_encode_args(self):
2007 for encoding in all_unicode_encodings:
2008 encoder = codecs.getencoder(encoding)
Inada Naoki6a16b182019-03-18 15:44:11 +09002009 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002010
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002011 def test_encoding_map_type_initialized(self):
2012 from encodings import cp1140
2013 # This used to crash, we are only verifying there's no crash.
2014 table_type = type(cp1140.encoding_table)
2015 self.assertEqual(table_type, table_type)
2016
Walter Dörwald3abcb012007-04-16 22:10:50 +00002017 def test_decoder_state(self):
2018 # Check that getstate() and setstate() handle the state properly
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002019 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00002020 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002021 if encoding not in broken_unicode_with_stateful:
Walter Dörwald3abcb012007-04-16 22:10:50 +00002022 self.check_state_handling_decode(encoding, u, u.encode(encoding))
2023 self.check_state_handling_encode(encoding, u, u.encode(encoding))
2024
Victor Stinnerf96418d2015-09-21 23:06:27 +02002025
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() is exercised with three kinds of mapping:
    # a str indexed by byte value, a dict of byte -> str and a dict of
    # byte -> code point.  Most cases decode the same three bytes and
    # vary only how byte 2 is (or is not) mapped.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        # A table covering all three bytes decodes to itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(raw, "strict", table), (table, 3))

        # Byte 2 is undefined when the table is too short or maps it
        # to U+FFFE.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        # Non-strict handlers and the text they produce for byte 2.
        recovered = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in recovered:
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (text, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        # Complete mappings; values may be empty or longer than one
        # character.
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, text in complete:
            self.assertEqual(decode(raw, "strict", table), (text, 3))

        # Byte 2 missing, mapped to None, or mapped to U+FFFE
        # (Issue #14850): undefined in every case.
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        recovered = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in recovered:
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (text, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

        # Integer values outside range(0x110000) are a TypeError.
        for out_of_range in (-2, 999999999):
            self.assertRaisesRegex(
                TypeError,
                "character mapping must be in range\\(0x110000\\)",
                decode,
                raw, "strict", {0: "A", 1: 'Bb', 2: out_of_range}
            )

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        # Complete mappings, including the highest code points
        # (Issue #15379).
        complete = (
            ({0: a, 1: b, 2: c}, "abc"),
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for table, text in complete:
            self.assertEqual(decode(raw, "strict", table), (text, 3))

        # One past the last code point is rejected outright.
        self.assertRaises(TypeError,
                          decode, raw, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 missing or mapped to 0xFFFE: undefined.
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        recovered = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in recovered:
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (text, 3))
2272
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002273
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers: usable
    # inside the block, with the wrapped stream closed on exit.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same exit check as test_encodedfile: leaving the block must
        # close the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002287
Victor Stinnerf96418d2015-09-21 23:06:27 +02002288
class TypesTest(unittest.TestCase):
    # Which input types the low-level decode functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only = (
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        )
        # mbcs_decode is not present on every build.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only += (codecs.mbcs_decode,)
        for decoder in bytes_only:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for source in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(source), ("\u1234", 6))

        # \U00110000 is past the last code point: an error under the
        # default handler, substituted under the others.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002328
Serhiy Storchakad6793772013-01-29 10:20:44 +02002329
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for the "unicode_escape" codec functions.

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Codes 32..126 other than the backslash are emitted unchanged.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point of
        # the same value.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        # Characters with a dedicated short escape.
        short_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for text, encoded in short_escapes:
            check(text, encoded)
        # The remaining codes below 32, and 127..255, use \xNN.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for encoded, text in recognised:
            check(encoded, text)

        # A backslash before any other ASCII letter is kept verbatim,
        # with a DeprecationWarning.
        for code in range(97, 123):
            lower = bytes([code])
            upper = lower.upper()
            if lower not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, "\\" + chr(code))
            if upper not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + upper, "\\" + chr(code-32))

        # The same applies to the digits 8 and 9 and to a non-ASCII byte.
        kept_verbatim = (
            (br"\8", "\\8"),
            (br"\9", "\\9"),
            (b"\\\xfa", "\\\xfa"),
        )
        for encoded, text in kept_verbatim:
            with self.assertWarns(DeprecationWarning):
                check(encoded, text)

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Escapes cut short after 0..digits-1 hex digits.
        for marker, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a value past the last code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2416
2417
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for the "raw_unicode_escape" codec functions.

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is emitted as the byte of that value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but "u" or "U" is unchanged.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but "u" or "U" is unchanged.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Escapes cut short after 0..digits-1 hex digits.
        for marker, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a value past the last code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2466
2467
class EscapeEncodeTest(unittest.TestCase):
    # Tests for codecs.escape_encode().

    def test_escape_encode(self):
        # Input bytes -> escaped bytes.  The second element of the
        # codec's result equals the input length in every case.
        escaped_forms = {
            b'': b'',
            b'foobar': b'foobar',
            b'spam\0eggs': b'spam\\x00eggs',
            b'a\'b': b"a\\'b",
            b'b\\c': b'b\\\\c',
            b'c\nd': b'c\\nd',
            b'd\re': b'd\\re',
            b'f\x7fg': b'f\\x7fg',
        }
        for data, escaped in escaped_forms.items():
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data),
                                 (escaped, len(data)))
        # str and bytearray input is rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2486
2487
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" handler, each undecodable byte 0xNN in
    # the cases below decodes to the lone surrogate U+DCNN, and encoding
    # that surrogate gives the byte back.

    def _check_both_ways(self, encoding, raw, text):
        # Decode raw -> text, then encode text -> raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_both_ways("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_both_ways("utf-8", b"\xed\xb0\x80",
                              "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_both_ways("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2520
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002521
class BomTest(unittest.TestCase):
    # BOM handling of files opened with codecs.open() when writes are
    # mixed with seeks.

    def test_seek0(self):
        payload = "1234567890"
        doubled = payload * 2
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                # Read back twice, rewinding before each read.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(payload[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(payload)
                writer.seek(writer.tell())
                writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002577
Victor Stinner3fed0872010-05-22 02:16:27 +00002578
# Bytes-to-bytes codecs that are always registered.  The optional
# compression codecs are appended below when their module imports.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# Alias names for each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib stays bound (to None) when it is missing.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update({"zlib_codec": ["zip", "zlib"]})

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update({"bz2_codec": ["bz2"]})
Georg Brandl02524622010-12-02 18:06:51 +00002608
Victor Stinnerf96418d2015-09-21 23:06:27 +02002609
class TransformCodecTest(unittest.TestCase):
    """Checks for the codecs in bytes_transform_encodings and for rot_13.

    Covers round-tripping, the stream reader interface, buffer protocol
    input, the errors raised by the str/bytes convenience methods, error
    wrapping for malformed input, and alias lookup.
    """

    def test_basics(self):
        """Round-trip all 256 byte values through every transform codec."""
        every_byte = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Go through the generic codecs interface
                encoded, consumed = codecs.getencoder(encoding)(every_byte)
                self.assertEqual(consumed, len(every_byte))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, every_byte)

    def test_read(self):
        """A stream reader's read() gives back the original payload."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                make_reader = codecs.getreader(encoding)
                self.assertEqual(make_reader(stream).read(), b"\x80")

    def test_readline(self):
        """A stream reader's readline() gives back the original payload."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                make_reader = codecs.getreader(encoding)
                self.assertEqual(make_reader(stream).readline(), b"\x80")

    def test_buffer_api_usage(self):
        """memoryview input is accepted both ways and round-trips."""
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Encoding a view must match encoding the bytes themselves
                encoded = codecs.encode(original, encoding)
                encoded_from_view = codecs.encode(memoryview(original),
                                                  encoding)
                self.assertEqual(encoded_from_view, encoded)
                # Decoding must restore the input, from bytes or a view
                encoded_view = memoryview(encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(encoded_view, encoding)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        """str.encode() rejects binary -> binary codecs with a clear error."""
        bad_input = "bad input type"
        pattern = (r"{!r} is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                expected = pattern.format(encoding)
                with self.assertRaisesRegex(LookupError, expected) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        """str.encode() rejects the str -> str rot_13 codec."""
        expected = (r"^'rot_13' is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, expected):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        """bytes.decode() and bytearray.decode() reject binary codecs."""
        data = b"encode first to ensure we meet any format restrictions"
        pattern = (r"{!r} is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                expected = pattern.format(encoding)
                for candidate in (encoded_data, bytearray(encoded_data)):
                    with self.assertRaisesRegex(LookupError, expected):
                        candidate.decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        """Binary input is rejected by the str -> str rot_13 codec."""
        expected = (r"^'rot_13' is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, expected) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        """Malformed zlib input raises an error naming the codec."""
        expected = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, expected) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper has the same type as the exception it wraps
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        """Malformed hex input raises an error naming the codec."""
        expected = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, expected) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The wrapper has the same type as the exception it wraps
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        """Every alias resolves to the same codec name as its target."""
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name, canonical)

    def test_quopri_stateless(self):
        """quopri-codec escapes whitespace but decodes it unescaped too."""
        # Should encode with quotetabs=True
        self.assertEqual(codecs.encode(b"space tab\teol \n", "quopri-codec"),
                         b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        roundtripped = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(roundtripped, unescaped)

    def test_uu_invalid(self):
        """uu-codec rejects input that lacks the "begin" line."""
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2741
Nick Coghlan8b097b42013-11-13 23:49:21 +10002742
2743# The codec system tries to wrap exceptions in order to ensure the error
2744# mentions the operation being performed and the codec involved. We
2745# currently *only* want this to happen for relatively stateless
2746# exceptions, where the only significant information they contain is their
2747# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002748
# Keep the test codecs in a module-private registry so that registering
# multiple search functions doesn't appear to leak objects
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: return the registered test codec or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it can't be applied as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """No-op fallback used when _codecs has no _forget_codec."""
2763
2764
class ExceptionChainingTest(unittest.TestCase):
    """Checks how exceptions raised inside a codec reach the caller.

    Each test installs encode/decode callables under a per-test codec name
    and drives them through str.encode(), bytes.decode(), codecs.encode()
    and codecs.decode().
    """

    def setUp(self):
        # A codec search function can't be unregistered, so the codec is
        # made fairly harmless after the test case finishes by naming it
        # after the test case repr.
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # The id is appended so the name is truly unique, which avoids
        # trouble with the codec cache when these tests run several times
        # (e.g. when hunting for refleaks)
        unique_id = f"{self!r}{id(self)}"
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # The object to raise lives on the instance because of a bad
        # interaction between codec caching (the codec entry can't be
        # recreated) and regrtest refleak hunting (the same test instance
        # runs several times). The codecs therefore have to call back in
        # to the instance to find out what to raise, rather than closing
        # over an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode* and *decode* under this test's codec name."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect *exc_type* carrying the wrapper message for *operation*.

        After the body has raised, the exception must be chained from
        another *exc_type* instance that has a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check that all four entry points wrap *obj_to_raise*."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check that all four entry points let *obj_to_raise* through."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                      r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_fmt.format(name)):
            "str_input".encode(name)
        decode_fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                      r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_fmt.format(name)):
            b"bytes input".decode(name)
2924
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002925
Georg Brandl02524622010-12-02 18:06:51 +00002926
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        """A negative code page is a ValueError, an unknown one an OSError."""
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        """The Unicode errors raised mention the code page name."""
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decode cases against code page *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = decoded[0], decoded[1]
            self.assertEqual(text, expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encode cases against code page *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw, consumed = encoded[0], encoded[1]
            self.assertEqual(raw, expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, raw, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        code_pages = (50220, 50221, 50222, 50225, 50227, 50229,
                      *range(57002, 57011+1), 65000)
        for cp in code_pages:
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if not is_code_page_present(cp):
                if support.verbose:
                    print(f" skipping cp={cp}")
                continue
            self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                             ('abc', 3), f'cp{cp}')
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        """With final=False a trailing incomplete sequence is not consumed."""
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            found = codecs.lookup('cp123')
        self.assertEqual(found.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        undecodable = b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff'
        encoded = b'01234567' * ((size//8)-1) + undecodable
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before inspecting the equally large result
        del encoded
        text, consumed = decoded[0], decoded[1]
        self.assertEqual(len(text), consumed)
        self.assertEqual(text[:10], '0123456701')
        self.assertEqual(text[-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        repeats = size//8
        encoded = b'0123456\xed\x84\x80' * repeats
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before inspecting the equally large result
        del encoded
        text = decoded[0]
        self.assertEqual(len(text), size)
        self.assertEqual(text[:10], '0123456\ud10001')
        self.assertEqual(text[-11:], '56\ud1000123456\ud100')
3141
Victor Stinner3a50e702011-10-18 21:21:00 +02003142
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        """Each error handler gives its documented output when encoding."""
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        """Each error handler gives its documented output when decoding."""
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3180
3181
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        """Each error handler gives its documented output when encoding."""
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3217
3218
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        sink = io.BytesIO()
        ascii_codec = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            sink, ascii_codec.encode, ascii_codec.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(sink.getvalue(), b'ab')

    def test_write(self):
        sink = io.BytesIO()
        latin1_codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            sink, latin1_codec.encode, latin1_codec.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(sink.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        """seek(0) restarts reading from the first line."""
        backing = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        self.assertEqual(recoded.readline(), b'line1\n')
        recoded.seek(0)
        for expected in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(recoded.readline(), expected)

    def test_seeking_write(self):
        backing = io.BytesIO('123456789\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        recoded.seek(2)
        recoded.write(b'\nabc\n')
        self.assertEqual(recoded.readline(), b'789\n')
        recoded.seek(0)
        for expected in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(recoded.readline(), expected)
3263
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003264
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Compare _testcapi.EncodeLocaleEx() and _testcapi.DecodeLocaleEx()
    with the Python codec of the filesystem encoding.

    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = (
        "ascii",
        "ulatin1:\xa7\xe9",
        "u255:\xff",
        "UCS:\xe9\u20ac\U0010ffff",
        "surrogates:\uDC80\uDCFF",
    )
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def _require_surrogatepass(self, convert, empty, role):
        # Probe the C helper with an empty input.  Skip the calling test
        # when the handler is rejected as unsupported; any other ValueError
        # is unexpected and propagates.
        try:
            convert(empty, 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} {role} doesn't support "
                          "surrogatepass error handler")

    def encode(self, text, errors="strict"):
        """Encode *text* with the C helper under test."""
        # NOTE(review): the literal 0 is presumably the helper's
        # "current locale" flag -- confirm against _testcapi.
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Check encode() against str.encode() for every entry of STRINGS.

        Where the Python codec raises UnicodeEncodeError, the C helper
        must raise RuntimeError with an "encode error" message.
        """
        for sample in self.STRINGS:
            with self.subTest(text=sample):
                try:
                    expected = sample.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as caught:
                        self.encode(sample, errors)
                    self.assertRegex(str(caught.exception),
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    self.assertEqual(self.encode(sample, errors), expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        self._require_surrogatepass(self.encode, '', 'encoder')
        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with the C helper under test."""
        # NOTE(review): the literal 0 is presumably the helper's
        # "current locale" flag -- confirm against _testcapi.
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Check decode() against bytes.decode() on a set of byte strings.

        The inputs are BYTES_STRINGS plus every encodable entry of STRINGS.
        Where the Python codec raises UnicodeDecodeError, the C helper
        must raise RuntimeError with a "decode error: " message.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        encode_errors = 'surrogateescape' if is_utf8 else 'strict'

        inputs = list(self.BYTES_STRINGS)
        for sample in self.STRINGS:
            try:
                primary = sample.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                primary = None
            else:
                if primary not in inputs:
                    inputs.append(primary)

            if is_utf8:
                # Also feed the surrogatepass form when it differs.
                passed = sample.encode(self.ENCODING, 'surrogatepass')
                if passed != primary:
                    inputs.append(passed)

        for raw in inputs:
            with self.subTest(encoded=raw):
                try:
                    expected = raw.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as caught:
                        self.decode(raw, errors)
                    message = str(caught.exception)
                    self.assertTrue(message.startswith("decode error: "),
                                    message)
                else:
                    self.assertEqual(self.decode(raw, errors), expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        self._require_surrogatepass(self.decode, b'', 'decoder')
        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')
3377
Victor Stinner3d4226a2018-08-29 22:21:32 +02003378
class Rot13Test(unittest.TestCase):
    """Exercise the 'rot-13' codec through the codecs module entry points."""

    def test_encode(self):
        """codecs.encode() maps plain text to its ROT-13 form."""
        self.assertEqual(
            codecs.encode("Caesar liked ciphers", 'rot-13'),
            'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        """codecs.decode() maps ROT-13 text back to plain text."""
        self.assertEqual(
            codecs.decode('Rg gh, Oehgr?', 'rot-13'),
            'Et tu, Brute?')

    def test_incremental_encode(self):
        """The incremental encoder handles a whole string in one call."""
        encoder_factory = codecs.getincrementalencoder('rot-13')
        rotated = encoder_factory().encode('ABBA nag Cheryl Baker')
        self.assertEqual(rotated, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        """The incremental decoder handles a whole string in one call."""
        decoder_factory = codecs.getincrementaldecoder('rot-13')
        restored = decoder_factory().decode('terra Ares envy tha')
        self.assertEqual(restored, 'green Nerf rail gun')
3398
3399
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        """rot13() reads all of infile and writes the rotated text to outfile."""
        # Import the submodule explicitly.  A bare ``import encodings`` does
        # not load ``encodings.rot_13``; the attribute only exists if some
        # earlier 'rot-13' codec lookup happened to import it, so relying on
        # it makes this test fail with AttributeError when run on its own.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)
        self.assertEqual(
            outfile.getvalue(),
            'To be, or not to be, that is the question')
3414
3415
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()