# blob: dcdd574bc7f4d6f21afaa628c99a0074d5904d7f [file] [log] [blame]
import codecs
import contextlib
import encodings
import io
import locale
import sys
import unittest
from unittest import mock

from test import support

# Optional module: the name is bound to None when the import fails, so users
# of _testcapi must check for None first.
try:
    import _testcapi
except ImportError:
    _testcapi = None

# Optional module: without ctypes the wchar_t size is unknown and
# SIZEOF_WCHAR_T is set to the sentinel -1.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    *self* is the object whose ``assertEqual`` is used (a test case) and
    *coder* is the function under test.  The returned helper asserts that
    ``coder(input)`` equals ``(expect, len(input))``, i.e. the expected
    output paired with the length of the whole input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Return the result of ``GetCPInfoExW`` for the code page number *cp*.

    The kernel32 function is called through ctypes with a BOOL return type,
    so the result is nonzero (true) when the call succeeds and zero (false)
    when it fails.  Windows only: the ctypes names imported below do not
    exist on other platforms.
    """
    # Import everything locally so the helper does not depend on the
    # module-level ``ctypes`` name, which is None when ctypes is unavailable.
    from ctypes import POINTER, Structure, WINFUNCTYPE, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # ctypes passes the structure by reference for the POINTER parameter.
    return GetCPInfoEx(cp, 0, info)

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. bytes or str)
        # determines what write() may append and what read() returns.
        self._buffer = buffer

    def write(self, chars):
        # Append *chars* to the end of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to *size* items from the front of the queue;
        # a negative *size* drains the whole queue.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s


class MixInCheckStateHandling:
    """Mixin with helpers that check getstate()/setstate() of incremental codecs.

    The helpers use ``assertIsInstance``/``assertFalse``/``assertEqual``, so
    the mixin must be combined with ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that a decoder's state can be saved and restored mid-input.

        For every split point of the encoded input *s*, decode the first
        part, transfer the decoder state to a fresh decoder, decode the rest
        there and check that the concatenated output equals *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that an encoder's state can be saved and restored mid-input.

        For every split point of the text *u*, encode the first part,
        transfer the encoder state to a fresh encoder, encode the rest there
        and check that the concatenated output equals *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)


class ReadTest(MixInCheckStateHandling):
    """Reader and decoder tests shared by the per-encoding test classes.

    Subclasses must define ``encoding`` (the codec name) and
    ``ill_formed_sequence`` (the bytes produced by encoding the lone
    surrogate U+DC80 with the "surrogatepass" handler), and must also derive
    from ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* to the decoders one byte at a time.

        After each byte the text decoded so far must equal the corresponding
        entry of *partialresults*.  The check is run with a StreamReader,
        with an incremental decoder, with the same incremental decoder after
        reset(), and finally with codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # StreamReader.readline() must split on every supported line end,
        # with and without keepends and with an explicit size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read all lines and join them with "|" so that the line
            # boundaries are visible in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline(), read(), read(n) and readlines() on one reader
        # must not lose or duplicate any data.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must return the \r\n-terminated
        # lines of a longer text unchanged.
        # NOTE(review): runs of whitespace inside these literals look
        # collapsed in this copy of the file; the test compares the lines
        # with themselves, so it is self-consistent either way -- confirm
        # against the upstream file before relying on the exact text.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is filled piecewise: a trailing "\r"
        # may later turn out to be the first half of a "\r\n".
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three consecutive \r\n-terminated lines must be returned intact,
        # followed by "" at the end of the stream.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Lone surrogates must be rejected by default and handled as
        # specified by each error handler, for encoding and for decoding.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for an empty string (e.g. a BOM) is
        # stripped from the individual pieces and prepended once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')


class UTF32Test(ReadTest, unittest.TestCase):
    """Run the shared ReadTest checks and BOM handling tests for "utf-32"."""
    encoding = "utf-32"
    # Bytes of the lone surrogate U+DC80 in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped by the handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])


class UTF32LETest(ReadTest, unittest.TestCase):
    """Run the shared ReadTest checks for the "utf-32-le" codec."""
    encoding = "utf-32-le"
    # Little-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # No BOM entries here: a character appears once its fourth byte
        # has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])


class UTF32BETest(ReadTest, unittest.TestCase):
    """Run the shared ReadTest checks for the "utf-32-be" codec."""
    encoding = "utf-32-be"
    # Big-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # No BOM entries here: a character appears once its fourth byte
        # has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-16" codec (byte order taken from / written as a BOM).
    encoding = "utf-16"
    # Code unit 0xDC80 (a lone low surrogate) in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two consecutive writes through one StreamWriter must emit a
        # single BOM, and the result must read back unchanged.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # b"\xff\xff" is neither the LE nor the BE byte order mark.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # check_partial() comes from ReadTest; the list gives the expected
        # decoded text after each further piece of encoded input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single trailing byte is a truncated code unit.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # check_state_handling_decode() comes from ReadTest.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

    def test_invalid_modes(self):
        # 'U' modes are rejected outright.
        for mode in ('U', 'rU', 'r+U'):
            with self.assertRaises(ValueError) as cm:
                codecs.open(support.TESTFN, mode, encoding=self.encoding)
            self.assertIn('invalid mode', str(cm.exception))

        # An explicit 't' is rejected with a different message.
        for mode in ('rt', 'wt', 'at', 'r+t'):
            with self.assertRaises(ValueError) as cm:
                codecs.open(support.TESTFN, mode, encoding=self.encoding)
            self.assertIn("can't have text and binary mode at once",
                          str(cm.exception))
730
731
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the explicit little-endian "utf-16-le" codec (no BOM).
    encoding = "utf-16-le"
    # Code unit 0xDC80 (a lone low surrogate), little-endian.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # check_partial() comes from ReadTest; the list gives the expected
        # decoded text after each further piece of encoded input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the loop going after a failure and names the
            # offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 round-trips as the surrogate pair D800 DE03.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
775
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the explicit big-endian "utf-16-be" codec (no BOM).
    encoding = "utf-16-be"
    # Code unit 0xDC80 (a lone low surrogate), big-endian.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial() comes from ReadTest; the list gives the expected
        # decoded text after each further piece of encoded input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the loop going after a failure and names the
            # offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 round-trips as the surrogate pair D800 DE03.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
819
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.  Subclasses may override `encoding` and
    # `BOM` to reuse these tests for a codec that prefixes its output.
    encoding = "utf-8"
    # The three bytes a naive UTF-8 encoding of U+DC80 would produce.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoded output (none for plain UTF-8).
    BOM = b''

    def test_partial(self):
        # check_partial() comes from ReadTest; the list gives the expected
        # decoded text after each further piece of encoded input.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text covering 1-, 2-, 3- and 4-byte encoded forms.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Same invalid input, one row per error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # Only U+DC80 can be escaped back to a byte; the error must span
        # the two surrogates that follow it.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates pass through as three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same sequences come back as surrogates.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or malformed surrogate sequences must still fail.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        # Every listed lead/continuation prefix followed by a byte that
        # cannot continue it.
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
912
Victor Stinnerf96418d2015-09-21 23:06:27 +0200913
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # check_partial() comes from ReadTest; the list gives the expected
        # decoded text after each further piece of encoded input.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # U+104A0 and its explicit surrogate pair encode identically; the
        # closing '-' is optional when decoding.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected result with the 'replace' handler)
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001046
1047
class UTF16ExTest(unittest.TestCase):
    # Tests for the low-level codecs.utf_16_ex_decode() function.

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True and the
        # 'strict' handler this must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1055
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        # An array of signed bytes is accepted and returned as bytes.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str is accepted as well.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an int argument.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1071
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Re-runs the UTF8Test cases against "utf-8-sig", which prefixes its
    # encoded output with the UTF-8 BOM, and adds BOM-specific tests.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # check_partial() comes from UTF8Test's ReadTest base; the list
        # gives the expected decoded text after each further piece of
        # encoded input.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder drops the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read `bytestring` through a utf-8-sig StreamReader once per size
        # hint (None means a plain read()) and check that the pieces join
        # up to `unistring`.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1156
Chris A2565ede2020-03-02 01:39:50 -05001157
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode(), which processes backslash escapes
    # in a bytes-like object.

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            b = bytes([value])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # check(input, expect) asserts decode(input) == (expect, len(input)).
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognised escapes are kept verbatim but warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Truncated \x escapes: strict raises, 'ignore' drops them and
        # 'replace' substitutes b"?".
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001217
Victor Stinnerf96418d2015-09-21 23:06:27 +02001218
Martin v. Löwis2548c732003-04-18 10:39:54 +00001219# From RFC 3492
1220punycode_testcases = [
1221 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001222 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1223 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001224 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001225 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001226 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001227 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001228 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001229 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001230 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001231 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001232 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1233 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1234 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001235 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001236 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001237 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1238 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1239 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001240 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001241 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001242 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001243 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1244 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1245 "\u0939\u0948\u0902",
1246 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001247
1248 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001249 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001250 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1251 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001252
1253 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001254 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1255 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1256 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001257 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1258 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001259
1260 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001261 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1262 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1263 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1264 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001265 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001266
1267 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001268 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1269 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1270 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1271 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1272 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001273 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274
1275 # (K) Vietnamese:
1276 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1277 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001278 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1279 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1280 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1281 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001282 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283
Martin v. Löwis2548c732003-04-18 10:39:54 +00001284 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001285 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001286 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001287
Martin v. Löwis2548c732003-04-18 10:39:54 +00001288 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001289 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1290 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1291 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001292 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293
1294 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001295 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1296 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1297 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001298 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001299
1300 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001301 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001302 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303
1304 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001305 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1306 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001307 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001308
1309 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001310 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001311 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312
1313 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001315 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316
1317 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1319 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001320 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 ]
1322
# Import-time sanity check of punycode_testcases: every entry is expected
# to be a 2-tuple, and any entry that is not gets printed.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326
Victor Stinnerf96418d2015-09-21 23:06:27 +02001327
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328class PunycodeTest(unittest.TestCase):
1329 def test_encode(self):
1330 for uni, puny in punycode_testcases:
1331 # Need to convert both strings to lower case, since
1332 # some of the extended encodings use upper case, but our
1333 # code produces only lower case. Converting just puny to
1334 # lower is also insufficient, since some of the input characters
1335 # are upper case.
Ezio Melottib3aedd42010-11-20 19:04:17 +00001336 self.assertEqual(
Walter Dörwalda4c61282007-05-10 12:36:25 +00001337 str(uni.encode("punycode"), "ascii").lower(),
1338 str(puny, "ascii").lower()
1339 )
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340
1341 def test_decode(self):
1342 for uni, puny in punycode_testcases:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001343 self.assertEqual(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +00001344 puny = puny.decode("ascii").encode("ascii")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001345 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346
Berker Peksagba22e8f2020-02-25 06:19:03 +03001347 def test_decode_invalid(self):
1348 testcases = [
1349 (b"xn--w&", "strict", UnicodeError()),
1350 (b"xn--w&", "ignore", "xn-"),
1351 ]
1352 for puny, errors, expected in testcases:
1353 with self.subTest(puny=puny, errors=errors):
1354 if isinstance(expected, Exception):
1355 self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
1356 else:
1357 self.assertEqual(puny.decode("punycode", errors), expected)
1358
Victor Stinnerf96418d2015-09-21 23:06:27 +02001359
Martin v. Löwis2548c732003-04-18 10:39:54 +00001360# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1361nameprep_tests = [
1362 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001363 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1364 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1365 b'\xb8\x8f\xef\xbb\xbf',
1366 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001367 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001368 (b'CAFE',
1369 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 # 3.3 Case folding 8bit U+00DF (german sharp s).
1371 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001372 (b'\xc3\x9f',
1373 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001374 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001375 (b'\xc4\xb0',
1376 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001377 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001378 (b'\xc5\x83\xcd\xba',
1379 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001380 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1381 # XXX: skip this as it fails in UCS-2 mode
1382 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1383 # 'telc\xe2\x88\x95kg\xcf\x83'),
1384 (None, None),
1385 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001386 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1387 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001388 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001389 (b'\xe1\xbe\xb7',
1390 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001391 # 3.9 Self-reverting case folding U+01F0 and normalization.
1392 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001393 (b'\xc7\xb0',
1394 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001395 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001396 (b'\xce\x90',
1397 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001398 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001399 (b'\xce\xb0',
1400 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001401 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001402 (b'\xe1\xba\x96',
1403 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001404 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001405 (b'\xe1\xbd\x96',
1406 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001407 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001408 (b' ',
1409 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001410 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001411 (b'\xc2\xa0',
1412 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001413 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001414 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001415 None),
1416 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001417 (b'\xe2\x80\x80',
1418 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001419 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001420 (b'\xe2\x80\x8b',
1421 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001422 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001423 (b'\xe3\x80\x80',
1424 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001425 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001426 (b'\x10\x7f',
1427 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001428 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001429 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001430 None),
1431 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001432 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001433 None),
1434 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001435 (b'\xef\xbb\xbf',
1436 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001437 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001438 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001439 None),
1440 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001441 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001442 None),
1443 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001444 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001445 None),
1446 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001447 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001448 None),
1449 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001450 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001451 None),
1452 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001453 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001454 None),
1455 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001456 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001457 None),
1458 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001459 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001460 None),
1461 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001462 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001463 None),
1464 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001465 (b'\xcd\x81',
1466 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001467 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001468 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001469 None),
1470 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001471 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001472 None),
1473 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001474 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001475 None),
1476 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001477 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001478 None),
1479 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001480 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001481 None),
1482 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001483 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001484 None),
1485 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001486 (b'foo\xef\xb9\xb6bar',
1487 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001488 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001489 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001490 None),
1491 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001492 (b'\xd8\xa71\xd8\xa8',
1493 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001494 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001495 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001496 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001497 # None),
1498 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001499 # 3.44 Larger test (shrinking).
1500 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001501 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1502 b'\xaa\xce\xb0\xe2\x80\x80',
1503 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001504 # 3.45 Larger test (expanding).
1505 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001506 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1507 b'\x80',
1508 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1509 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1510 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001511 ]
1512
1513
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Check every vector, reporting each one under its "3.N" number.

        An entry whose input is None is a placeholder for a skipped vector.
        An entry whose expected output is None must make nameprep() raise
        UnicodeError.  Each vector runs in its own subTest so that a failure
        keeps its original exception type, is labelled with the vector
        number, and does not prevent the remaining vectors from running.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            with self.subTest(test="3.%d" % pos):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001532
Victor Stinnerf96418d2015-09-21 23:06:27 +02001533
class IDNACodecTest(unittest.TestCase):
    """Tests of the "idna" codec through the one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        """One-shot decoding of ASCII and ACE names, with and without a
        trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """One-shot encoding of ASCII and non-ASCII names, with and without
        a trailing dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """A second read() on an exhausted StreamReader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte-by-byte with iterdecode() and with an explicit
        incremental decoder."""
        # Same four names as test_builtin_decode, fed one byte at a time.
        samples = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in samples:
            with self.subTest(encoded=encoded):
                self.assertEqual(
                    "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                    expected
                )

        # The assertions below show that a label is only emitted once the
        # dot that ends it has been seen, or when final=True is passed.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode with iterencode() and with an explicit incremental
        encoder."""
        # Same four names as test_builtin_encode.
        samples = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in samples:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # As for decoding, an unterminated label stays buffered until a dot
        # or final=True arrives.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        # "strict" must work in both directions ...
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        # ... and every other handler must be rejected.
        for errors in ("ignore", "replace", "backslashreplace",
                "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                b"python.org".decode, "idna", errors)
1619
Victor Stinnerf96418d2015-09-21 23:06:27 +02001620
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001621class CodecsModuleTest(unittest.TestCase):
1622
1623 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001624 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1625 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001626 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001627 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001628 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001629
Victor Stinnera57dfd02014-05-14 17:13:14 +02001630 # test keywords
1631 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1632 '\xe4\xf6\xfc')
1633 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1634 '[]')
1635
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001636 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001637 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1638 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001639 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001640 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001641 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001642 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001643
Victor Stinnera57dfd02014-05-14 17:13:14 +02001644 # test keywords
1645 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1646 b'\xe4\xf6\xfc')
1647 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1648 b'[]')
1649
Walter Dörwald063e1e82004-10-28 13:04:26 +00001650 def test_register(self):
1651 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001652 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001653
1654 def test_lookup(self):
1655 self.assertRaises(TypeError, codecs.lookup)
1656 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001657 self.assertRaises(LookupError, codecs.lookup, " ")
1658
1659 def test_getencoder(self):
1660 self.assertRaises(TypeError, codecs.getencoder)
1661 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1662
1663 def test_getdecoder(self):
1664 self.assertRaises(TypeError, codecs.getdecoder)
1665 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1666
1667 def test_getreader(self):
1668 self.assertRaises(TypeError, codecs.getreader)
1669 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1670
1671 def test_getwriter(self):
1672 self.assertRaises(TypeError, codecs.getwriter)
1673 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001674
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001675 def test_lookup_issue1813(self):
1676 # Issue #1813: under Turkish locales, lookup of some codecs failed
1677 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001678 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001679 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1680 try:
1681 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1682 except locale.Error:
1683 # Unsupported locale on this system
1684 self.skipTest('test needs Turkish locale')
1685 c = codecs.lookup('ASCII')
1686 self.assertEqual(c.name, 'ascii')
1687
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001688 def test_all(self):
1689 api = (
1690 "encode", "decode",
1691 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1692 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1693 "getencoder", "getdecoder", "getincrementalencoder",
1694 "getincrementaldecoder", "getreader", "getwriter",
1695 "register_error", "lookup_error",
1696 "strict_errors", "replace_errors", "ignore_errors",
1697 "xmlcharrefreplace_errors", "backslashreplace_errors",
1698 "namereplace_errors",
1699 "open", "EncodedFile",
1700 "iterencode", "iterdecode",
1701 "BOM", "BOM_BE", "BOM_LE",
1702 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1703 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1704 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1705 "StreamReaderWriter", "StreamRecoder",
1706 )
1707 self.assertCountEqual(api, codecs.__all__)
1708 for api in codecs.__all__:
1709 getattr(codecs, api)
1710
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001711 def test_open(self):
1712 self.addCleanup(support.unlink, support.TESTFN)
1713 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1714 with self.subTest(mode), \
1715 codecs.open(support.TESTFN, mode, 'ascii') as file:
1716 self.assertIsInstance(file, codecs.StreamReaderWriter)
1717
1718 def test_undefined(self):
1719 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1720 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1721 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1722 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1723 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1724 self.assertRaises(UnicodeError,
1725 codecs.encode, 'abc', 'undefined', errors)
1726 self.assertRaises(UnicodeError,
1727 codecs.decode, b'abc', 'undefined', errors)
1728
Chris A2565ede2020-03-02 01:39:50 -05001729 def test_file_closes_if_lookup_error_raised(self):
1730 mock_open = mock.mock_open()
1731 with mock.patch('builtins.open', mock_open) as file:
1732 with self.assertRaises(LookupError):
1733 codecs.open(support.TESTFN, 'wt', 'invalid-encoding')
1734
1735 file().close.assert_called()
1736
Victor Stinnerf96418d2015-09-21 23:06:27 +02001737
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 bytes for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """The stream splits into two decoded lines; the first keeps its
        newline."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001747
Victor Stinnerf96418d2015-09-21 23:06:27 +02001748
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the wrapped file holds UTF-8 bytes and read() hands them
        # back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes passed to write() land in the wrapped file
        # as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001760
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001761all_unicode_encodings = [
1762 "ascii",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001763 "big5",
1764 "big5hkscs",
1765 "charmap",
1766 "cp037",
1767 "cp1006",
1768 "cp1026",
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02001769 "cp1125",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001770 "cp1140",
1771 "cp1250",
1772 "cp1251",
1773 "cp1252",
1774 "cp1253",
1775 "cp1254",
1776 "cp1255",
1777 "cp1256",
1778 "cp1257",
1779 "cp1258",
1780 "cp424",
1781 "cp437",
1782 "cp500",
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001783 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001784 "cp737",
1785 "cp775",
1786 "cp850",
1787 "cp852",
1788 "cp855",
1789 "cp856",
1790 "cp857",
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001791 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001792 "cp860",
1793 "cp861",
1794 "cp862",
1795 "cp863",
1796 "cp864",
1797 "cp865",
1798 "cp866",
1799 "cp869",
1800 "cp874",
1801 "cp875",
1802 "cp932",
1803 "cp949",
1804 "cp950",
1805 "euc_jis_2004",
1806 "euc_jisx0213",
1807 "euc_jp",
1808 "euc_kr",
1809 "gb18030",
1810 "gb2312",
1811 "gbk",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001812 "hp_roman8",
1813 "hz",
1814 "idna",
1815 "iso2022_jp",
1816 "iso2022_jp_1",
1817 "iso2022_jp_2",
1818 "iso2022_jp_2004",
1819 "iso2022_jp_3",
1820 "iso2022_jp_ext",
1821 "iso2022_kr",
1822 "iso8859_1",
1823 "iso8859_10",
1824 "iso8859_11",
1825 "iso8859_13",
1826 "iso8859_14",
1827 "iso8859_15",
1828 "iso8859_16",
1829 "iso8859_2",
1830 "iso8859_3",
1831 "iso8859_4",
1832 "iso8859_5",
1833 "iso8859_6",
1834 "iso8859_7",
1835 "iso8859_8",
1836 "iso8859_9",
1837 "johab",
1838 "koi8_r",
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03001839 "koi8_t",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001840 "koi8_u",
Serhiy Storchakaad8a1c32015-05-12 23:16:55 +03001841 "kz1048",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001842 "latin_1",
1843 "mac_cyrillic",
1844 "mac_greek",
1845 "mac_iceland",
1846 "mac_latin2",
1847 "mac_roman",
1848 "mac_turkish",
1849 "palmos",
1850 "ptcp154",
1851 "punycode",
1852 "raw_unicode_escape",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001853 "shift_jis",
1854 "shift_jis_2004",
1855 "shift_jisx0213",
1856 "tis_620",
1857 "unicode_escape",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001858 "utf_16",
1859 "utf_16_be",
1860 "utf_16_le",
1861 "utf_7",
1862 "utf_8",
1863]
1864
1865if hasattr(codecs, "mbcs_encode"):
1866 all_unicode_encodings.append("mbcs")
Steve Dowerf5aba582016-09-06 19:42:27 -07001867if hasattr(codecs, "oem_encode"):
1868 all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001869
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001870# The following encoding is not tested, because it's not supposed
1871# to work:
1872# "undefined"
1873
1874# The following encodings don't work in stateful mode
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001875broken_unicode_with_stateful = [
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001876 "punycode",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001877]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001878
Victor Stinnerf96418d2015-09-21 23:06:27 +02001879
Walter Dörwald3abcb012007-04-16 22:10:50 +00001880class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001881 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001882 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001883 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001884 name = codecs.lookup(encoding).name
1885 if encoding.endswith("_codec"):
1886 name += "_codec"
1887 elif encoding == "latin_1":
1888 name = "latin_1"
1889 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001890
Inada Naoki6a16b182019-03-18 15:44:11 +09001891 (b, size) = codecs.getencoder(encoding)(s)
1892 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1893 (chars, size) = codecs.getdecoder(encoding)(b)
1894 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001895
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001896 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001897 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001898 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001899 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001900 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001901 for c in s:
1902 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001903 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001904 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001905 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001906 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001907 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001908 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001909 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001910 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001911 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001912 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001913
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001914 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001915 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001916 try:
1917 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001918 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001919 pass
1920 else:
1921 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001922 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001923 for c in s:
1924 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001925 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001926 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001927 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001928 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001929 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001930 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001931 self.assertEqual(decodedresult, s,
1932 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001933
1934 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001935 result = "".join(codecs.iterdecode(
1936 codecs.iterencode(s, encoding), encoding))
1937 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001938
1939 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001940 result = "".join(codecs.iterdecode(
1941 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001942 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001943
Victor Stinner554f3f02010-06-16 23:33:54 +00001944 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001945 # check incremental decoder/encoder with errors argument
1946 try:
1947 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001948 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001949 pass
1950 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001951 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001952 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001953 decodedresult = "".join(decoder.decode(bytes([c]))
1954 for c in encodedresult)
1955 self.assertEqual(decodedresult, s,
1956 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001957
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_basics_capi(self):
    # Round-trip a short ASCII string through the incremental encoder and
    # decoder objects returned by _testcapi for every unicode encoding.
    # _testcapi is imported optionally at module level and may be None,
    # so skip explicitly instead of failing with AttributeError.
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API: feed one character, then one byte, at a time
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                # final=True flushes whatever the encoder still buffers
                encodedresult += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001995
def test_seek(self):
    # all codecs should be able to encode these
    s = "%s\n%s\n" % ("abc123" * 100, "def456" * 100)
    for encoding in all_unicode_encodings:
        # FIXME: See SF bug #1163178 (idna is skipped)
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        raw = io.BytesIO(s.encode(encoding))
        reader = codecs.getreader(encoding)(raw)
        for _ in range(5):
            # Test that calling seek resets the internal codec state and
            # buffers: every pass must read the complete text again.
            reader.seek(0, 0)
            self.assertEqual(s, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00002010
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all except
    # idna and punycode must also reject an integer argument.
    for encoding in all_unicode_encodings:
        decode = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decode()
        if encoding in ("idna", "punycode"):
            continue
        with self.assertRaises(TypeError):
            decode(42)
2017
def test_bad_encode_args(self):
    # Every encoder must reject a call without arguments.
    for encoding in all_unicode_encodings:
        encode = codecs.getencoder(encoding)
        with self.assertRaises(TypeError):
            encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002022
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002023 def test_encoding_map_type_initialized(self):
2024 from encodings import cp1140
2025 # This used to crash, we are only verifying there's no crash.
2026 table_type = type(cp1140.encoding_table)
2027 self.assertEqual(table_type, table_type)
2028
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    text = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        self.check_state_handling_decode(encoding, text,
                                         text.encode(encoding))
        self.check_state_handling_encode(encoding, text,
                                         text.encode(encoding))
2036
Victor Stinnerf96418d2015-09-21 23:06:27 +02002037
class CharmapTest(unittest.TestCase):
    # Most checks decode the same three bytes; only the error handler and
    # the mapping object vary from case to case.
    _RAW = b"\x00\x01\x02"

    # (error handler, expected text) when byte 2 cannot be mapped.
    _HANDLED = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _expect_text(self, errors, mapping, text):
        # Decoding must produce *text* and report all three bytes consumed.
        self.assertEqual(
            codecs.charmap_decode(self._RAW, errors, mapping),
            (text, 3)
        )

    def _expect_error(self, exc_type, mapping):
        # Strict decoding with *mapping* must raise *exc_type*.
        self.assertRaises(exc_type,
            codecs.charmap_decode, self._RAW, "strict", mapping
        )

    def _expect_all_ignored(self, empty_mapping):
        # With an empty mapping and "ignore", every byte is dropped.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_mapping),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        self._expect_text("strict", "abc", "abc")
        self._expect_text("strict", "\U0010FFFFbc", "\U0010FFFFbc")

        # Byte 2 is unmapped: the string is too short or holds U+FFFE there.
        unmapped = ("ab", "ab\ufffe")
        for mapping in unmapped:
            self._expect_error(UnicodeDecodeError, mapping)
        for errors, text in self._HANDLED:
            for mapping in unmapped:
                self._expect_text(errors, mapping, text)

        self._expect_all_ignored("")

    def test_decode_with_int2str_map(self):
        self._expect_text("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc")
        self._expect_text("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc")
        self._expect_text("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'},
                          "\U0010FFFFbc")
        self._expect_text("strict", {0: 'a', 1: 'b', 2: ''}, "ab")

        # Byte 2 is unmapped: key missing, mapped to None, or mapped to
        # U+FFFE (Issue #14850).
        unmapped = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in unmapped:
            self._expect_error(UnicodeDecodeError, mapping)
        for errors, text in self._HANDLED:
            for mapping in unmapped:
                self._expect_text(errors, mapping, text)

        self._expect_all_ignored({})

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self._expect_text("strict", {0: a, 1: b, 2: c}, "abc")
        # Issue #15379
        self._expect_text("strict", {0: 0x10FFFF, 1: b, 2: c},
                          "\U0010FFFFbc")
        self._expect_text("strict", {0: sys.maxunicode, 1: b, 2: c},
                          chr(sys.maxunicode) + "bc")

        # A code point above sys.maxunicode is a TypeError, not a
        # decoding error.
        self._expect_error(TypeError, {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is unmapped: key missing or mapped to 0xFFFE.
        unmapped = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in unmapped:
            self._expect_error(UnicodeDecodeError, mapping)
        for errors, text in self._HANDLED:
            for mapping in unmapped:
                self._expect_text(errors, mapping, text)
2272
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002273
class WithStmtTest(unittest.TestCase):
    # Both codecs file wrappers must work as context managers and close
    # the stream they wrap when the ``with`` block is left.

    def test_encodedfile(self):
        # UTF-8 bytes for U+00FC in the file are read back as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as for EncodedFile: leaving the block closes
        # the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002287
Victor Stinnerf96418d2015-09-21 23:06:27 +02002288
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only included when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for data in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(data), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point: strict decoding
        # fails, the error handlers substitute for the whole escape.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002328
Serhiy Storchakad6793772013-01-29 10:20:44 +02002329
class UnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input gives empty output with a length of zero.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # Other control characters and 0x7f-0xff are written as \xNN.
        for code in (*range(32), *range(127, 256)):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognized = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in recognized:
            check(raw, text)

        # A backslash before any other ASCII letter is kept literally and
        # must trigger a DeprecationWarning.
        for code in range(97, 123):
            lower = bytes([code])
            upper = lower.upper()
            if lower not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, "\\" + chr(code))
            if upper not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + upper, "\\" + chr(code - 32))
        unrecognized = (
            (br"\8", "\\8"),
            (br"\9", "\\9"),
            (b"\\\xfa", "\\\xfa"),
        )
        for raw, text in unrecognized:
            with self.assertWarns(DeprecationWarning):
                check(raw, text)

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for marker, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with too few hex digits, bare and in brackets.
                truncated = b"\\" + marker + b"0" * count
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2416
2417
class RawUnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input gives empty output with a length of zero.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of that value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but "u" or "U" passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but "u" or "U" passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for marker, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with too few hex digits, bare and in brackets.
                truncated = b"\\" + marker + b"0" * count
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2466
2467
class EscapeEncodeTest(unittest.TestCase):

    def test_escape_encode(self):
        # Input bytes mapped to their escaped form; the reported length
        # is always the length of the input.
        escaped_forms = {
            b'': b'',
            b'foobar': b'foobar',
            b'spam\0eggs': b'spam\\x00eggs',
            b'a\'b': b"a\\'b",
            b'b\\c': b'b\\\\c',
            b'c\nd': b'c\\nd',
            b'd\re': b'd\\re',
            b'f\x7fg': b'f\\x7fg',
        }
        for data, escaped in escaped_forms.items():
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data),
                                 (escaped, len(data)))
        # Only bytes objects are accepted: str and bytearray are rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2485 self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2486
2487
class SurrogateEscapeTest(unittest.TestCase):

    def _check_roundtrip(self, encoding, raw, text):
        # *raw* must decode to *text* and *text* must encode back to *raw*
        # when the surrogateescape handler is in effect.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80",
                              "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2520
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002521
class BomTest(unittest.TestCase):
    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        encodings_under_test = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        for encoding in encodings_under_test:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as f:
                f.write(text)
                f.write(text)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002577
Victor Stinner3fed0872010-05-22 02:16:27 +00002578
# Names of the bytes-to-bytes transform codecs exercised by the tests
# below; codecs backed by optional modules are appended further down.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alternative names listed for each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib is optional; the name is bound to None when it is unavailable.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional as well; nothing is bound when it is unavailable.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002608
Victor Stinnerf96418d2015-09-21 23:06:27 +02002609
Georg Brandl02524622010-12-02 18:06:51 +00002610class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002611
def test_basics(self):
    # Every byte value must survive an encode/decode round trip, and both
    # directions must report having consumed their whole input.
    payload = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)
Georg Brandl02524622010-12-02 18:06:51 +00002622
def test_read(self):
    # A stream reader must return the original byte from read().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002630
def test_readline(self):
    # A stream reader must return the original byte from readline().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002638
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            original_view = memoryview(original)
            encoded = codecs.encode(original, encoding)
            encoded_from_view = codecs.encode(original_view, encoding)
            self.assertEqual(encoded_from_view, encoded)

            encoded_view = memoryview(encoded)
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(encoded_view, encoding)
            self.assertEqual(decoded_from_view, decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002656
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002657 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002658 # Check binary -> binary codecs give a good error for str input
2659 bad_input = "bad input type"
2660 for encoding in bytes_transform_encodings:
2661 with self.subTest(encoding=encoding):
R David Murray44b548d2016-09-08 13:59:53 -04002662 fmt = (r"{!r} is not a text encoding; "
2663 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002664 msg = fmt.format(encoding)
2665 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002666 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002667 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002668
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002669 def test_text_to_binary_blacklists_text_transforms(self):
2670 # Check str.encode gives a good error message for str -> str codecs
2671 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002672 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002673 with self.assertRaisesRegex(LookupError, msg):
2674 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002675
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002676 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002677 # Check bytes.decode and bytearray.decode give a good error
2678 # message for binary -> binary codecs
2679 data = b"encode first to ensure we meet any format restrictions"
2680 for encoding in bytes_transform_encodings:
2681 with self.subTest(encoding=encoding):
2682 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002683 fmt = (r"{!r} is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002684 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002685 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002686 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002687 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002688 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002689 bytearray(encoded_data).decode(encoding)
2690
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002691 def test_binary_to_text_blacklists_text_transforms(self):
2692 # Check str -> str codec gives a good error for binary input
2693 for bad_input in (b"immutable", bytearray(b"mutable")):
2694 with self.subTest(bad_input=bad_input):
2695 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002696 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002697 with self.assertRaisesRegex(LookupError, msg) as failure:
2698 bad_input.decode("rot_13")
2699 self.assertIsNone(failure.exception.__cause__)
2700
Zachary Wareefa2e042013-12-30 14:54:11 -06002701 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002702 def test_custom_zlib_error_is_wrapped(self):
2703 # Check zlib codec gives a good error for malformed input
2704 msg = "^decoding with 'zlib_codec' codec failed"
2705 with self.assertRaisesRegex(Exception, msg) as failure:
2706 codecs.decode(b"hello", "zlib_codec")
2707 self.assertIsInstance(failure.exception.__cause__,
2708 type(failure.exception))
2709
2710 def test_custom_hex_error_is_wrapped(self):
2711 # Check hex codec gives a good error for malformed input
2712 msg = "^decoding with 'hex_codec' codec failed"
2713 with self.assertRaisesRegex(Exception, msg) as failure:
2714 codecs.decode(b"hello", "hex_codec")
2715 self.assertIsInstance(failure.exception.__cause__,
2716 type(failure.exception))
2717
2718 # Unfortunately, the bz2 module throws OSError, which the codec
2719 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002720
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002721 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2722 def test_aliases(self):
2723 for codec_name, aliases in transform_aliases.items():
2724 expected_name = codecs.lookup(codec_name).name
2725 for alias in aliases:
2726 with self.subTest(alias=alias):
2727 info = codecs.lookup(alias)
2728 self.assertEqual(info.name, expected_name)
2729
Martin Panter06171bd2015-09-12 00:34:28 +00002730 def test_quopri_stateless(self):
2731 # Should encode with quotetabs=True
2732 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2733 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2734 # But should still support unescaped tabs and spaces
2735 unescaped = b"space tab eol\n"
2736 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2737
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002738 def test_uu_invalid(self):
2739 # Missing "begin" line
2740 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2741
Nick Coghlan8b097b42013-11-13 23:49:21 +10002742
2743# The codec system tries to wrap exceptions in order to ensure the error
2744# mentions the operation being performed and the codec involved. We
2745# currently *only* want this to happen for relatively stateless
2746# exceptions, where the only significant information they contain is their
2747# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002748
# Test codecs live in this private mapping, served by a single search
# function, so that registering several search functions does not look
# like an object leak.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: return the entry stored in _TEST_CODECS
    under *codec_name*, or None when there is no such entry.
    """
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it cannot be applied as a decorator.
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Fallback used when _codecs does not provide _forget_codec:
        accept the codec name and do nothing.
        """
2763
2764
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised by a codec get wrapped with the
    operation and codec name, and which are propagated untouched.

    Each test installs a throwaway codec in the module-level
    ``_TEST_CODECS`` registry under a name unique to the test instance.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so the
        # registered one is kept fairly harmless after the test case
        # finishes by using the test case repr as the codec name.
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # id(self) is appended so the name is truly unique, which avoids
        # issues with the codec cache when these tests run several times
        # (e.g. when hunting for refleaks).
        raw_name = repr(self) + str(id(self))
        normalized = encodings.normalize_encoding(raw_name)
        self.codec_name = normalized.lower()

        # The object to raise is stored on the instance because of a bad
        # interaction between the codec caching (the codec entry can't be
        # recreated) and regrtest refleak hunting (the same test instance
        # runs multiple times). The codecs therefore call back in to the
        # instance to find out what to raise, instead of binding it in a
        # closure to an object that may change on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Publish a CodecInfo built from *encode*/*decode* under this
        test's codec name in the local registry.
        """
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Context manager asserting that the body raises *exc_type* whose
        text names *operation*, the codec, the exception type and *msg*,
        and whose __cause__ is an *exc_type* instance with a traceback.
        """
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Install a codec that raises *obj_to_raise* and verify that all
        four encode/decode entry points report it wrapped.
        """
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Install a codec that raises *obj_to_raise* and verify that all
        four encode/decode entry points raise a RuntimeError matching *msg*.
        """
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name

        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")

        # Text model methods should complain
        encode_fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                      r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_fmt.format(name)):
            "str_input".encode(name)

        decode_fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                      r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_fmt.format(name)):
            b"bytes input".decode(name)
2924
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002925
Georg Brandl02524622010-12-02 18:06:51 +00002926
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() / codecs.code_page_decode()."""

    # Code page number passed wherever the UTF-8 code page is exercised.
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page that failed.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode every ``(raw, errors, expected)`` entry of *tests* with
        code page *cp* (final=True). An *expected* of None means the call
        must raise UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = decoded
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode every ``(text, errors, expected)`` entry of *tests* with
        code page *cp*. An *expected* of None means the call must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data, consumed = encoded
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode)
        self.check_decode(self.CP_UTF8, utf8_decode)
        self.check_encode(self.CP_UTF8, utf8_encode)

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        code_pages = (50220, 50221, 50222, 50225, 50227, 50229,
                      *range(57002, 57011+1), 65000)
        for cp in code_pages:
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if not is_code_page_present(cp):
                if support.verbose:
                    print(f" skipping cp={cp}")
                continue
            self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                             ('abc', 3), f'cp{cp}')
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left
        # unconsumed rather than reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            info = codecs.lookup('cp123')
            self.assertEqual(info.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        undecodable = b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff'
        encoded = b'01234567' * ((size//8)-1) + undecodable
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # The input is released before the output is inspected
        # (presumably to limit peak memory use).
        del encoded
        text, consumed = decoded
        self.assertEqual(len(text), consumed)
        self.assertEqual(text[:10], '0123456701')
        self.assertEqual(text[-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = b'0123456\xed\x84\x80' * (size//8)
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(self.CP_UTF8, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        # The input is released before the output is inspected
        # (presumably to limit peak memory use).
        del encoded
        text = decoded[0]
        self.assertEqual(len(text), size)
        self.assertEqual(text[:10], '0123456\ud10001')
        self.assertEqual(text[-11:], '56\ud1000123456\ud100')
3141
Victor Stinner3a50e702011-10-18 21:21:00 +02003142
class ASCIITest(unittest.TestCase):
    """Encoding and decoding with the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # Each row: (text to encode, error handler name, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                actual = text.encode('ascii', handler)
                self.assertEqual(actual, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        mixed = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            mixed.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # Each row: (bytes to decode, error handler name, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                actual = raw.decode('ascii', handler)
                self.assertEqual(actual, wanted)
3180
3181
class Latin1Test(unittest.TestCase):
    """Encoding and decoding with the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                actual = text.encode('latin1')
                self.assertEqual(actual, wanted)

    def test_encode_errors(self):
        # Each row: (text to encode, error handler name, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                actual = text.encode('latin1', handler)
                self.assertEqual(actual, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        mixed = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            mixed.encode('latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                actual = raw.decode('latin1')
                self.assertEqual(actual, wanted)
3217
3218
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        sink = io.BytesIO()
        ascii_info = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            sink, ascii_info.encode, ascii_info.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(sink.getvalue(), b'ab')

    def test_write(self):
        sink = io.BytesIO()
        latin1_info = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            sink, latin1_info.encode, latin1_info.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(sink.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        backing = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        self.assertEqual(recoded.readline(), b'line1\n')
        recoded.seek(0)
        # After rewinding, the lines come back in order, then EOF.
        for wanted in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(recoded.readline(), wanted)

    def test_seeking_write(self):
        backing = io.BytesIO('123456789\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        recoded.seek(2)
        recoded.write(b'\nabc\n')
        self.assertEqual(recoded.readline(), b'789\n')
        recoded.seek(0)
        for wanted in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(recoded.readline(), wanted)
3263
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003264
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    # Encoding used with the pure-Python codecs to compute the expected
    # result of every _testcapi call.
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with _testcapi.EncodeLocaleEx()."""
        # NOTE(review): the literal 0 is presumably the "current locale"
        # flag of EncodeLocaleEx() -- confirm against _testcapi.
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Compare encode() with str.encode() for every entry of STRINGS.

        When the Python codec raises UnicodeEncodeError, encode() must
        raise RuntimeError with an "encode error: pos=..., reason=" message.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(errmsg,
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: skip the test when the encoder
        # rejects the surrogatepass error handler altogether.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with _testcapi.DecodeLocaleEx()."""
        # NOTE(review): the literal 0 is presumably the "current locale"
        # flag of DecodeLocaleEx() -- confirm against _testcapi.
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Compare decode() with bytes.decode() on a set of byte strings.

        The inputs are BYTES_STRINGS plus every entry of STRINGS that can
        be encoded to ENCODING.  When the Python codec raises
        UnicodeDecodeError, decode() must raise RuntimeError with a message
        starting with "decode error: ".
        """
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            # Keep the try body down to the single call that can raise.
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                encoded = None
            else:
                if encoded not in strings:
                    strings.append(encoded)

            if is_utf8:
                # Also exercise the surrogatepass byte form when it differs
                # from the form produced above; never add a duplicate.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded and encoded2 not in strings:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "),
                                    errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty input first: skip the test when the decoder
        # rejects the surrogatepass error handler altogether.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')
3377
Victor Stinner3d4226a2018-08-29 22:21:32 +02003378
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        """codecs.encode() applies ROT-13 to a str."""
        ciphertext = codecs.encode("Caesar liked ciphers", 'rot-13')
        self.assertEqual(ciphertext, 'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        """codecs.decode() reverses ROT-13 on a str."""
        plaintext = codecs.decode('Rg gh, Oehgr?', 'rot-13')
        self.assertEqual(plaintext, 'Et tu, Brute?')

    def test_incremental_encode(self):
        """The incremental encoder gives the ROT-13 form of its input."""
        encoder = codecs.getincrementalencoder('rot-13')()
        ciphertext = encoder.encode('ABBA nag Cheryl Baker')
        self.assertEqual(ciphertext, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        """The incremental decoder gives the ROT-13 form of its input."""
        decoder = codecs.getincrementaldecoder('rot-13')()
        plaintext = decoder.decode('terra Ares envy tha')
        self.assertEqual(plaintext, 'green Nerf rail gun')
3398
3399
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        """rot13(infile, outfile) writes the ROT-13 form of infile."""
        # Import the submodule explicitly.  Going through the
        # `encodings.rot_13` attribute only works once something else has
        # imported that submodule, which made this test fail with
        # AttributeError when it ran on its own.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3414
3415
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()