blob: 8d9cb9089039cf2492035bbbeba136cac2798ac2 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# Optional helper modules: fall back to None when they are unavailable so
# code that depends on them can detect that and adapt.
try:
    import _testcapi
except ImportError:
    _testcapi = None

try:
    import ctypes
except ImportError:
    ctypes = None
    # Sentinel meaning "size of wchar_t unknown" (ctypes is unavailable).
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build an assertion helper for the codec function *coder*.

    *self* is the object whose ``assertEqual`` method is used for the
    comparison.  The returned ``check(input, expect)`` callable asserts that
    ``coder(input)`` equals the tuple ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Return the result of calling kernel32 ``GetCPInfoExW`` for code page *cp*.

    The value is the BOOL returned by the Win32 call, so callers can use its
    truthiness to decide whether the code page is usable.  Windows only: the
    function imports ``WinDLL``/``WINFUNCTYPE`` from ctypes.
    """
    # Everything needed from ctypes is imported locally so the helper does not
    # depend on the module-level ``ctypes`` name (which may be None).
    from ctypes import POINTER, WINFUNCTYPE, Structure, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2 # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # Second argument is the dwFlags parameter, passed as 0.
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020050
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. b"" or "") decides
        # what kind of data the queue holds.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0] # make empty, keeping the type
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
        return s
70
Victor Stinnerf96418d2015-09-21 23:06:27 +020071
class MixInCheckStateHandling:
    """Helpers that exercise getstate()/setstate() of incremental codecs.

    The methods call ``assertEqual``, ``assertFalse`` and ``assertIsInstance``
    on *self*, so the class has to be mixed into something providing them.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* yields *u* when the decoder state is
        transplanted into a fresh decoder at every possible split point."""
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* yields *s* when the encoder state is
        transplanted into a fresh encoder at every possible split point."""
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
104
Victor Stinnerf96418d2015-09-21 23:06:27 +0200105
class ReadTest(MixInCheckStateHandling):
    """Codec-independent reader/decoder tests.

    Concrete test classes combine this with unittest.TestCase and provide an
    ``encoding`` attribute (codec name); ``test_lone_surrogates`` additionally
    reads ``ill_formed_sequence`` (the encoded form of a lone surrogate).
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must hand back exactly the lines that
        # were encoded into the stream.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data arrives at the reader
        # in the pieces the writer emitted.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ``ill_formed_sequence`` when decoding with the
    # "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for an empty string (e.g. a BOM) is
        # stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +0200444
Victor Stinnerf96418d2015-09-21 23:06:27 +0200445
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (BOM-prefixed, native byte order)."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in the platform byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
Victor Stinnerf96418d2015-09-21 23:06:27 +0200541
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec (little-endian, no BOM)."""

    encoding = "utf-32-le"
    # Little-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once all
        # four of its bytes have been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
586
Victor Stinnerf96418d2015-09-21 23:06:27 +0200587
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec (big-endian, no BOM)."""

    encoding = "utf-32-be"
    # Big-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once all
        # four of its bytes have been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
632
633
class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the BOM-prefixed "utf-16" codec.
    encoding = "utf-16"
    # A lone low surrogate (U+DC80) laid out in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Data that does not start with a valid BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated code unit goes through the selected error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

    def test_invalid_modes(self):
        # subTest() reports which mode string failed.
        for mode in ('U', 'rU', 'r+U'):
            with self.subTest(mode=mode):
                with self.assertRaises(ValueError) as cm:
                    codecs.open(support.TESTFN, mode, encoding=self.encoding)
                self.assertIn('invalid mode', str(cm.exception))

        for mode in ('rt', 'wt', 'at', 'r+t'):
            with self.subTest(mode=mode):
                with self.assertRaises(ValueError) as cm:
                    codecs.open(support.TESTFN, mode, encoding=self.encoding)
                self.assertIn("can't have text and binary mode at once",
                              str(cm.exception))
730
731
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the little-endian, BOM-less "utf-16-le" codec.
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each input is invalid UTF-16-LE: strict decoding must fail and
        # 'replace' must produce the paired expected string.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
775
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the big-endian, BOM-less "utf-16-be" codec.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each input is invalid UTF-16-BE: strict decoding must fail and
        # 'replace' must produce the paired expected string.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
819
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.  Subclasses may override ``encoding`` and
    # ``BOM`` to reuse the expectations that are written as ``self.BOM + ...``.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Two invalid bytes between brackets, decoded with each error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        # Only the surrogates that surrogateescape cannot map are reported.
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        self.assertEqual(
            "abc\ud800def".encode(self.encoding, "surrogatepass"),
            self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual(
            "\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
            self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            "[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
            self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        self.assertEqual(
            b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
            "abc\ud800def")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding,
                                                   "surrogatepass"),
            "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted surrogate sequences must still fail.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        prefixes = (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                    b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                    b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF')
        # Every valid lead prefix followed by a byte that cannot continue it.
        cases.extend(prefix + suffix
                     for prefix in prefixes
                     for suffix in (b'\x7F', b'\xC0'))
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
912
Victor Stinnerf96418d2015-09-21 23:06:27 +0200913
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Each input is invalid UTF-7: strict decoding must fail and
        # 'replace' must produce the paired expected string.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to the
        # same bytes; decoding works with or without the trailing '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding),
                         '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding),
                         '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Expected 'replace' output for inputs containing an unpaired
        # surrogate in various positions of a base64 run.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001046
1047
class UTF16ExTest(unittest.TestCase):
    # Tests for the low-level codecs.utf_16_ex_decode() function.

    def test_errors(self):
        # A single byte is a truncated code unit when final=True.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1055
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1071
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Re-runs the UTF8Test cases against "utf-8-sig" (the encoder emits a
    # leading BOM) and adds BOM-specific stream tests.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder strips the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a "utf-8-sig" StreamReader using a range
        # of read sizes (None means a plain read()) and check that the
        # concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1156
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash is passed through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            b = bytes([value])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognised escapes are kept verbatim but emit a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Truncated \x escapes: strict raises, 'ignore' drops them and
        # 'replace' substitutes b"?".
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001216
Victor Stinnerf96418d2015-09-21 23:06:27 +02001217
Martin v. Löwis2548c732003-04-18 10:39:54 +00001218# From RFC 3492
1219punycode_testcases = [
1220 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001221 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1222 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001223 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001224 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001225 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001226 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001227 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001228 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001229 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001230 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001231 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1232 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1233 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001234 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001235 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001236 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1237 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1238 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001239 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001240 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001241 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001242 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1243 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1244 "\u0939\u0948\u0902",
1245 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001246
1247 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001248 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001249 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1250 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251
1252 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001253 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1254 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1255 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001256 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1257 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001258
1259 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001260 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1261 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1262 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1263 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001264 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265
1266 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001267 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1268 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1269 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1270 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1271 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001272 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273
1274 # (K) Vietnamese:
1275 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1276 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001277 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1278 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1279 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1280 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001281 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001282
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001284 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001285 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001286
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001288 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1289 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1290 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001291 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292
1293 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001294 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1295 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1296 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001297 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298
1299 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001300 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001301 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302
1303 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001304 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1305 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001306 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307
1308 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001309 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001310 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311
1312 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001313 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001314 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315
1316 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001317 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1318 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001319 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 ]
1321
# Import-time sanity check: the punycode tests unpack every entry of
# punycode_testcases as a (unicode, punycode) pair, so print any entry
# that does not have exactly two items to make a malformed vector easy
# to spot.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325
Victor Stinnerf96418d2015-09-21 23:06:27 +02001326
class PunycodeTest(unittest.TestCase):
    """Run the ``punycode_testcases`` vectors through the "punycode" codec."""

    def test_encode(self):
        """Encoding each Unicode sample yields the expected punycode."""
        for uni, puny in punycode_testcases:
            # One subtest per vector: a failure names the offending sample
            # and the remaining vectors are still checked.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        """Decoding each punycode sample yields the original Unicode."""
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again from a bytes object rebuilt through an
                # ASCII decode/encode round trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))

    def test_decode_invalid(self):
        """Invalid input raises under "strict" and is dropped under "ignore"."""
        # Each entry is (input, error handler, expected); an exception
        # instance as the expected value means "decoding must raise".
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors), expected)
1357
Victor Stinnerf96418d2015-09-21 23:06:27 +02001358
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# in the order of sections 3.1 .. 3.45 of that document:
#   * expected is None  -> nameprep must reject the input;
#   * (None, None)      -> the vector is skipped (kept as a placeholder so
#                          that list position still maps to section number).
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1511
1512
class NameprepTest(unittest.TestCase):
    """Check ``encodings.idna.nameprep`` against the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        """Every non-skipped vector is either mapped as expected or rejected."""
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Label each vector with its section number ("3.<n>") so a
            # failure is reported as a normal test failure, with its full
            # traceback, and the remaining vectors are still checked.
            with self.subTest(test="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001531
Victor Stinnerf96418d2015-09-21 23:06:27 +02001532
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through the one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time; cover ASCII and non-ASCII
        # names, each with and without a trailing dot.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once its terminating dot has been seen,
        # or when the final chunk is signalled.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() feeds the string one character at a time; cover
        # ASCII and non-ASCII names, each with and without a trailing dot.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once its terminating dot has been seen,
        # or when the final chunk is signalled.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            # Expect the codec's own rejection (a UnicodeError) rather than
            # accepting any exception at all.
            self.assertRaises(UnicodeError,
                              "python.org".encode, "idna", errors)
            self.assertRaises(UnicodeError,
                              b"python.org".decode, "idna", errors)
1618
Victor Stinnerf96418d2015-09-21 23:06:27 +02001619
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE: '\xffff' is U+00FF followed by the two letters "ff"; the
        # leading non-ASCII character is what makes the ASCII encode fail.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so the previous locale is
        # put back however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        """``codecs.__all__`` lists exactly the expected names, all of which
        exist on the module."""
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        # Use a separate loop variable so the expected-names tuple above is
        # not rebound while iterating.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        # 'w' comes first so the file exists by the time the read modes
        # are tried.
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec must fail for any input, including empty
        # input, and whatever error handler is requested.
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)
1727
Victor Stinnerf96418d2015-09-21 23:06:27 +02001728
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 ``StreamReader``."""

    def setUp(self):
        # Two UTF-8 encoded characters (U+D55C, U+AE00) separated by a
        # newline; the second line has no trailing newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001738
Victor Stinnerf96418d2015-09-21 23:06:27 +02001739
class EncodedFileTest(unittest.TestCase):
    """Transcoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the wrapper hands
        # back the same text re-encoded as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data given to the wrapper is stored in the
        # underlying file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001751
# Names of the text encodings exercised by the generic codec tests.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are only added when this build of the codecs module
# provides the corresponding encode functions.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001860
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001869
Victor Stinnerf96418d2015-09-21 23:06:27 +02001870
Walter Dörwald3abcb012007-04-16 22:10:50 +00001871class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001872 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001873 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001874 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001875 name = codecs.lookup(encoding).name
1876 if encoding.endswith("_codec"):
1877 name += "_codec"
1878 elif encoding == "latin_1":
1879 name = "latin_1"
1880 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001881
Inada Naoki6a16b182019-03-18 15:44:11 +09001882 (b, size) = codecs.getencoder(encoding)(s)
1883 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1884 (chars, size) = codecs.getdecoder(encoding)(b)
1885 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001886
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001887 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001888 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001889 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001890 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001891 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001892 for c in s:
1893 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001894 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001895 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001896 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001897 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001898 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001899 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001900 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001901 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001902 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001903 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001904
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001905 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001906 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001907 try:
1908 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001909 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001910 pass
1911 else:
1912 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001913 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001914 for c in s:
1915 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001916 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001917 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001918 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001919 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001920 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001921 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001922 self.assertEqual(decodedresult, s,
1923 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001924
1925 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001926 result = "".join(codecs.iterdecode(
1927 codecs.iterencode(s, encoding), encoding))
1928 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001929
1930 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001931 result = "".join(codecs.iterdecode(
1932 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001933 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001934
Victor Stinner554f3f02010-06-16 23:33:54 +00001935 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001936 # check incremental decoder/encoder with errors argument
1937 try:
1938 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001939 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001940 pass
1941 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001942 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001943 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001944 decodedresult = "".join(decoder.decode(bytes([c]))
1945 for c in encodedresult)
1946 self.assertEqual(decodedresult, s,
1947 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001948
@support.cpython_only
@unittest.skipIf(_testcapi is None, "requires the _testcapi module")
def test_basics_capi(self):
    # Round-trip a short ASCII string through the incremental encoder and
    # decoder objects returned by the _testcapi helpers, feeding them one
    # character / one byte at a time, for every name in
    # all_unicode_encodings that is not in broken_unicode_with_stateful.
    #
    # The module-level import binds _testcapi to None when the extension
    # cannot be imported; the skipIf decorator reports that as a skip
    # instead of letting the loop fail with AttributeError.
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API
            encodedresult = b"".join(cencoder.encode(c) for c in s)
            # The second argument (final=True) ends the encoding run.
            encodedresult += cencoder.encode("", True)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding)
            decodedresult = "".join(cdecoder.decode(bytes([c]))
                                    for c in encodedresult)
            decodedresult += cdecoder.decode(b"", True)
            self.assertEqual(decodedresult, s,
                             "encoding=%r" % encoding)

        if encoding in ("idna", "mbcs"):
            continue

        # check incremental decoder/encoder with errors argument
        # (this variant deliberately issues no final=True call)
        try:
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            encodedresult = b"".join(cencoder.encode(c) for c in s)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
            decodedresult = "".join(cdecoder.decode(bytes([c]))
                                    for c in encodedresult)
            self.assertEqual(decodedresult, s,
                             "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001986
def test_seek(self):
    # Re-read the same encoded stream several times through a StreamReader,
    # rewinding with seek(0, 0) before each pass.
    # all codecs should be able to encode these
    text = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # FIXME: "idna" is skipped, see SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(io.BytesIO(text.encode(encoding)))
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            decoded = reader.read()
            self.assertEqual(text, decoded)
Walter Dörwald729c31f2005-03-14 19:06:30 +00002001
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all except
    # "idna" and "punycode" are also expected to reject an int input.
    lenient = ("idna", "punycode")
    for encoding in all_unicode_encodings:
        decode = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decode()
        if encoding in lenient:
            continue
        with self.assertRaises(TypeError):
            decode(42)
2008
def test_bad_encode_args(self):
    # Every encoder must reject a call without arguments.
    for encoding in all_unicode_encodings:
        encode = codecs.getencoder(encoding)
        with self.assertRaises(TypeError):
            encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002013
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002014 def test_encoding_map_type_initialized(self):
2015 from encodings import cp1140
2016 # This used to crash, we are only verifying there's no crash.
2017 table_type = type(cp1140.encoding_table)
2018 self.assertEqual(table_type, table_type)
2019
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    text = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        self.check_state_handling_decode(encoding, text,
                                         text.encode(encoding))
        self.check_state_handling_encode(encoding, text,
                                         text.encode(encoding))
2027
Victor Stinnerf96418d2015-09-21 23:06:27 +02002028
class CharmapTest(unittest.TestCase):
    # Tests for codecs.charmap_decode() with three kinds of mapping:
    # a str indexed by byte value, a dict of int -> str and a dict of
    # int -> int (code point).

    # Input used by almost every case: the bytes 0, 1 and 2.
    _sample = b"\x00\x01\x02"

    # Expected text, per error handler, when byte 2 has no usable mapping.
    _fallbacks = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _assert_decodes(self, errors, mapping, expected):
        # Decode the sample and require (expected, 3) as the result.
        self.assertEqual(
            codecs.charmap_decode(self._sample, errors, mapping),
            (expected, 3)
        )

    def _assert_unmapped(self, mappings):
        # For mappings that map bytes 0 and 1 to 'a' and 'b' but give no
        # usable value for byte 2: "strict" must raise, and each other
        # handler must produce its fallback text.
        for mapping in mappings:
            self.assertRaises(UnicodeDecodeError,
                codecs.charmap_decode, self._sample, "strict", mapping
            )
        for errors, expected in self._fallbacks:
            for mapping in mappings:
                self._assert_decodes(errors, mapping, expected)

    def test_decode_with_string_map(self):
        # With a str map the decoded text equals the map itself.
        for table in ("abc", "\U0010FFFFbc"):
            self._assert_decodes("strict", table, table)

        # Too short a map, or '\ufffe' in the slot, leaves byte 2 unmapped.
        self._assert_unmapped(("ab", "ab\ufffe"))

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in complete:
            self._assert_decodes("strict", mapping, expected)

        # A missing key, None, and '\ufffe' (Issue #14850) are all
        # expected to be treated as "no mapping" for byte 2.
        self._assert_unmapped((
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ))

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        a, b, c = (ord(ch) for ch in "abc")

        # 0x10FFFF is the Issue #15379 case.
        complete = (
            (a, "abc"),
            (0x10FFFF, "\U0010FFFFbc"),
            (sys.maxunicode, chr(sys.maxunicode) + "bc"),
        )
        for first, expected in complete:
            self._assert_decodes("strict", {0: first, 1: b, 2: c}, expected)

        # A code point beyond sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, self._sample, "strict",
            {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        # A missing key or 0xFFFE leaves byte 2 unmapped.
        self._assert_unmapped((
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ))
2263
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002264
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable in a with statement.

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Once the with block is left the wrapped stream must be closed.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002278
Victor Stinnerf96418d2015-09-21 23:06:27 +02002279
class TypesTest(unittest.TestCase):
    # Which input types the low-level codec functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only.append(codecs.mbcs_decode)
        for decode in bytes_only:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 lies outside the Unicode range: strict decoding fails,
        # the other handlers substitute for the whole 10-character escape.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002319
Serhiy Storchakad6793772013-01-29 10:20:44 +02002320
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for text, escaped in named:
            check(text, escaped)
        # Remaining control characters use the \xNN form ...
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        # ... and so does everything from DEL up to U+00FF.
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in recognised:
            check(raw, text)

        # Letters that do not start a known escape are kept together with
        # their backslash, and a DeprecationWarning is expected.
        for code in range(97, 123):
            lower = bytes([code])
            upper = lower.upper()
            if lower not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, "\\" + chr(code))
            if upper not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + upper, "\\" + chr(code-32))

        other_unknown = (
            (br"\8", "\\8"),
            (br"\9", "\\9"),
            (b"\\\xfa", "\\\xfa"),
        )
        for raw, text in other_unknown:
            with self.assertWarns(DeprecationWarning):
                check(raw, text)

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # For each escape letter, try 0 .. count-1 hex digits after it.
        for letter, count in (b'x', 2), (b'u', 4), (b'U', 4):
            for ndigits in range(count):
                truncated = b"\\" + letter + b"0"*ndigits
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2407
2408
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() / _decode().

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of that value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but 'u' or 'U' passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but 'u' or 'U' passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # For each escape letter, try 0 .. count-1 hex digits after it.
        for letter, count in (b'u', 4), (b'U', 4):
            for ndigits in range(count):
                truncated = b"\\" + letter + b"0"*ndigits
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2457
2458
class EscapeEncodeTest(unittest.TestCase):
    # Tests for codecs.escape_encode(), which only accepts bytes.

    def test_escape_encode(self):
        # (input, escaped output); the reported length in every case is
        # the length of the input.
        samples = (
            (b'', b''),
            (b'foobar', b'foobar'),
            (b'spam\0eggs', b'spam\\x00eggs'),
            (b'a\'b', b"a\\'b"),
            (b'b\\c', b'b\\\\c'),
            (b'c\nd', b'c\\nd'),
            (b'd\re', b'd\\re'),
            (b'f\x7fg', b'f\\x7fg'),
        )
        for data, escaped in samples:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data),
                                 (escaped, len(data)))
        # str and bytearray inputs are rejected.
        for rejected in ('spam', bytearray(b'spam')):
            self.assertRaises(TypeError, codecs.escape_encode, rejected)
2477
2478
class SurrogateEscapeTest(unittest.TestCase):
    # Round trips through the "surrogateescape" error handler.

    def _roundtrip(self, raw, encoding, text):
        # raw must decode to text, and text must encode back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._roundtrip(b"foo\x80bar", "utf-8", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._roundtrip(b"\xed\xb0\x80", "utf-8", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._roundtrip(b"foo\x80bar", "ascii", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._roundtrip(b"foo\xa5bar", "iso-8859-3", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2511
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002512
class BomTest(unittest.TestCase):
    # Writing, seeking and re-reading files opened with codecs.open()
    # for the UTF-16 / UTF-32 family of encodings.

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def scratch(encoding):
            # Each scenario starts from a freshly truncated file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        # Each scenario is run first through the file object itself and
        # then through its StreamWriter (f.writer).
        targets = (lambda f: f, lambda f: f.writer)

        for encoding in tests:
            # Check if the BOM is written only once
            with scratch(encoding) as f:
                f.write(data)
                f.write(data)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            for pick in targets:
                with scratch(encoding) as f:
                    out = pick(f)
                    out.write(data[0])
                    self.assertNotEqual(out.tell(), 0)
                    out.seek(0)
                    out.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            for pick in targets:
                with scratch(encoding) as f:
                    out = pick(f)
                    out.write(data)
                    out.seek(out.tell())
                    out.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002568
Victor Stinner3fed0872010-05-22 02:16:27 +00002569
# Bytes-to-bytes transform codecs exercised by the tests below.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alias names for each transform codec, keyed by canonical codec name.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# The compression codecs are registered only when their modules import.
try:
    import zlib
except ImportError:
    # Keep the name bound so code can test "zlib is None".
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases.update(zlib_codec=["zip", "zlib"])

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases.update(bz2_codec=["bz2"])
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002598 transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002599
Victor Stinnerf96418d2015-09-21 23:06:27 +02002600
Georg Brandl02524622010-12-02 18:06:51 +00002601class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002602
def test_basics(self):
    # Encode and decode all 256 byte values with each transform codec and
    # check both the reported lengths and the round-tripped payload.
    payload = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)
Georg Brandl02524622010-12-02 18:06:51 +00002613
def test_read(self):
    # A StreamReader for each transform codec must give back the
    # original byte from read().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002621
def test_readline(self):
    # A StreamReader for each transform codec must give back the
    # original byte from readline().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.readline(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002629
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            plain_view = memoryview(original)
            encoded = codecs.encode(original, encoding)
            encoded_from_view = codecs.encode(plain_view, encoding)
            self.assertEqual(encoded_from_view, encoded)

            encoded_view = memoryview(encoded)
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(encoded_view, encoding)
            self.assertEqual(decoded_from_view, decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002647
def test_text_to_binary_blacklists_binary_transforms(self):
    # Check binary -> binary codecs give a good error for str input
    text = "bad input type"
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            pattern = template.format(encoding)
            with self.assertRaisesRegex(LookupError, pattern) as failure:
                text.encode(encoding)
            # The LookupError must not be chained to another exception.
            self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002659
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002660 def test_text_to_binary_blacklists_text_transforms(self):
2661 # Check str.encode gives a good error message for str -> str codecs
2662 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002663 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002664 with self.assertRaisesRegex(LookupError, msg):
2665 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002666
def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode() and bytearray.decode() must both give a helpful
    # error message for binary -> binary codecs
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    plain = b"encode first to ensure we meet any format restrictions"
    for codec_name in bytes_transform_encodings:
        with self.subTest(encoding=codec_name):
            # Encode first so the input is valid for the codec's format
            encoded = codecs.encode(plain, codec_name)
            pattern = template.format(codec_name)
            with self.assertRaisesRegex(LookupError, pattern):
                encoded.decode(codec_name)
            with self.assertRaisesRegex(LookupError, pattern):
                mutable = bytearray(encoded)
                mutable.decode(codec_name)
2681
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002682 def test_binary_to_text_blacklists_text_transforms(self):
2683 # Check str -> str codec gives a good error for binary input
2684 for bad_input in (b"immutable", bytearray(b"mutable")):
2685 with self.subTest(bad_input=bad_input):
2686 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002687 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002688 with self.assertRaisesRegex(LookupError, msg) as failure:
2689 bad_input.decode("rot_13")
2690 self.assertIsNone(failure.exception.__cause__)
2691
@unittest.skipUnless(zlib, "Requires zlib support")
def test_custom_zlib_error_is_wrapped(self):
    # Malformed input must produce an error that names the codec and
    # whose __cause__ is an exception of the same type
    pattern = "^decoding with 'zlib_codec' codec failed"
    with self.assertRaisesRegex(Exception, pattern) as caught:
        codecs.decode(b"hello", "zlib_codec")
    wrapper = caught.exception
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2700
2701 def test_custom_hex_error_is_wrapped(self):
2702 # Check hex codec gives a good error for malformed input
2703 msg = "^decoding with 'hex_codec' codec failed"
2704 with self.assertRaisesRegex(Exception, msg) as failure:
2705 codecs.decode(b"hello", "hex_codec")
2706 self.assertIsInstance(failure.exception.__cause__,
2707 type(failure.exception))
2708
2709 # Unfortunately, the bz2 module throws OSError, which the codec
2710 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002711
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002712 # Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias must resolve to the same canonical codec name as the
    # codec it is listed under in transform_aliases
    for canonical, alias_names in transform_aliases.items():
        wanted = codecs.lookup(canonical).name
        for alias in alias_names:
            with self.subTest(alias=alias):
                resolved = codecs.lookup(alias).name
                self.assertEqual(resolved, wanted)
2720
Martin Panter06171bd2015-09-12 00:34:28 +00002721 def test_quopri_stateless(self):
2722 # Should encode with quotetabs=True
2723 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2724 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2725 # But should still support unescaped tabs and spaces
2726 unescaped = b"space tab eol\n"
2727 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2728
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002729 def test_uu_invalid(self):
2730 # Missing "begin" line
2731 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2732
Nick Coghlan8b097b42013-11-13 23:49:21 +10002733
2734# The codec system tries to wrap exceptions in order to ensure the error
2735# mentions the operation being performed and the codec involved. We
2736# currently *only* want this to happen for relatively stateless
2737# exceptions, where the only significant information they contain is their
2738# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002739
2740# Use a local codec registry to avoid appearing to leak objects when
Martin Panter119e5022016-04-16 09:28:57 +00002741# registering multiple search functions
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    """Codec search function: look *codec_name* up in _TEST_CODECS.

    Returns the registered entry, or None when the name is unknown.
    """
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Do-nothing fallback used when _codecs has no _forget_codec."""
        return None
2754
2755
class ExceptionChainingTest(unittest.TestCase):
    """Check when codec failures are wrapped with codec context.

    Each test installs a throwaway codec under a per-instance name in the
    module-local _TEST_CODECS registry, then drives it through
    str.encode(), codecs.encode(), bytes.decode() and codecs.decode().
    """

    def setUp(self):
        # A codec search function can't be unregistered, so the codec is
        # made fairly harmless after the test by naming it after the test
        # case repr. The codecs module normalizes codec names (this does
        # not appear to be formally documented), hence the explicit
        # normalization. The id() suffix makes the name truly unique,
        # avoiding codec cache issues when the tests are run several
        # times (e.g. when hunting for refleaks).
        instance_tag = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(instance_tag).lower()

        # The object to raise lives on the instance because codec caching
        # prevents recreating the codec entry, while regrtest refleak
        # hunting runs the same test instance repeatedly. The codecs
        # therefore call back into the instance to find out what to raise
        # rather than closing over an object that may change next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode* and *decode* under this test's codec name."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    def _codec_calls(self, text):
        """Return (operation, callable, args) for the four codec entry points.

        *text* is the str input for the two encoding calls; the two
        decoding calls always use b"bytes input".
        """
        name = self.codec_name
        return (
            ("encoding", text.encode, (name,)),
            ("encoding", codecs.encode, (text, name)),
            ("decoding", b"bytes input".decode, (name,)),
            ("decoding", codecs.decode, (b"bytes input", name)),
        )

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the body raises *exc_type* wrapped with codec context.

        The raised message must match the "<operation> with <codec> codec
        failed (<type>: <msg>)" pattern, and its __cause__ must be an
        *exc_type* instance that carries a traceback.
        """
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check all four entry points wrap *obj_to_raise* as *exc_type*."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        for operation, call, call_args in self._codec_calls("str_input"):
            with self.assertWrapped(operation, exc_type, msg):
                call(*call_args)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check all four entry points raise *obj_to_raise* matching *msg*."""
        def always_raise(*args, **kwds):
            raise obj_to_raise
        self.set_codec(always_raise, always_raise)
        for _operation, call, call_args in self._codec_calls("str input"):
            with self.assertRaisesRegex(RuntimeError, msg):
                call(*call_args)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is registered here, so the initial lookup fails; that
        # failure should not be wrapped
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        for _operation, call, call_args in self._codec_calls("str input"):
            with self.assertRaisesRegex(LookupError, pattern):
                call(*call_args)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so the text model
        # related methods skip them pre-emptively. Third party codecs
        # won't be flagged, so the case where an inappropriate output
        # type is produced still has to be handled appropriately.
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_template = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                           r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_template.format(name)):
            "str_input".encode(name)
        decode_template = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                           r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_template.format(name)):
            b"bytes input".decode(name)
2915
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002916
Georg Brandl02524622010-12-02 18:06:51 +00002917
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 is rejected with ValueError, code page 123 with OSError
        for exc_type, cp in ((ValueError, -1), (OSError, 123)):
            with self.assertRaises(exc_type):
                codecs.code_page_encode(cp, 'a')
            with self.assertRaises(exc_type):
                codecs.code_page_decode(cp, b'a')

    def test_code_page_name(self):
        # The error message must mention the code page name
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An *expected* of None means decoding must raise
        UnicodeDecodeError; otherwise the decoded text must equal
        *expected* and the consumed count must lie within
        0..len(raw).
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An *expected* of None means encoding must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the consumed count must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        verbose = support.verbose
        if verbose:
            sys.stdout.write('\n')
        code_pages = (50220, 50221, 50222, 50225, 50227, 50229,
                      *range(57002, 57011+1), 65000)
        for cp in code_pages:
            # On small versions of Windows like Windows IoT not all
            # codepages are present. A missing codepage causes an OSError
            # exception, so check for the codepage before decoding.
            if not is_code_page_present(cp):
                if verbose:
                    print(f" skipping cp={cp}")
                continue
            decoded = codecs.code_page_decode(cp, b'abc')
            self.assertEqual(decoded, ('abc', 3), f'cp{cp}')
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left
        # unconsumed instead of being reported as an error
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            info = codecs.lookup('cp123')
            self.assertEqual(info.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        raw = (b'01234567' * ((size//8)-1) +
               b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(raw), size+2)
        result = codecs.code_page_decode(932, raw, 'surrogateescape', True)
        self.assertEqual(result[1], len(raw))
        # Release the input before slicing the large result
        del raw
        self.assertEqual(len(result[0]), result[1])
        self.assertEqual(result[0][:10], '0123456701')
        self.assertEqual(result[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        raw = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(raw), size // 8 * 10)
        result = codecs.code_page_decode(65001, raw, 'ignore', True)
        self.assertEqual(result[1], len(raw))
        # Release the input before slicing the large result
        del raw
        self.assertEqual(len(result[0]), size)
        self.assertEqual(result[0][:10], '0123456\ud10001')
        self.assertEqual(result[0][-11:], '56\ud1000123456\ud100')
3132
Victor Stinner3a50e702011-10-18 21:21:00 +02003133
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # surrogateescape can handle the first character (a lone
        # surrogate) but not the second one
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3171
3172
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                encoded = text.encode('latin1')
                self.assertEqual(encoded, wanted)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('latin1', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # surrogateescape can handle the first character (a lone
        # surrogate) but not the second one
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                decoded = raw.decode('latin1')
                self.assertEqual(decoded, wanted)
3208
3209
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        sink = io.BytesIO()
        ascii_codec = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            sink, ascii_codec.encode, ascii_codec.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(sink.getvalue(), b'ab')

    def test_write(self):
        sink = io.BytesIO()
        latin1_codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            sink, latin1_codec.encode, latin1_codec.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(sink.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        backing = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        recoder = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        self.assertEqual(recoder.readline(), b'line1\n')
        # After rewinding, the whole content must be readable again,
        # followed by an empty read at end of file
        recoder.seek(0)
        for wanted in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(recoder.readline(), wanted)

    def test_seeking_write(self):
        backing = io.BytesIO('123456789\n'.encode('utf-16-le'))
        recoder = codecs.EncodedFile(backing, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        recoder.seek(2)
        recoder.write(b'\nabc\n')
        self.assertEqual(recoder.readline(), b'789\n')
        recoder.seek(0)
        for wanted in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(recoder.readline(), wanted)
3254
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003255
Victor Stinner3d4226a2018-08-29 22:21:32 +02003256@unittest.skipIf(_testcapi is None, 'need _testcapi module')
3257class LocaleCodecTest(unittest.TestCase):
3258 """
3259 Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
3260 """
3261 ENCODING = sys.getfilesystemencoding()
3262 STRINGS = ("ascii", "ulatin1:\xa7\xe9",
3263 "u255:\xff",
3264 "UCS:\xe9\u20ac\U0010ffff",
3265 "surrogates:\uDC80\uDCFF")
3266 BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
3267 SURROGATES = "\uDC80\uDCFF"
3268
def encode(self, text, errors="strict"):
    """Encode *text* via _testcapi.EncodeLocaleEx().

    The second argument of EncodeLocaleEx() is always passed as 0;
    *errors* names the error handler.
    """
    encoded = _testcapi.EncodeLocaleEx(text, 0, errors)
    return encoded
3271
def check_encode_strings(self, errors):
    """Compare self.encode() with str.encode() for every sample string.

    When str.encode() with self.ENCODING and *errors* fails,
    self.encode() must raise RuntimeError with an "encode error"
    message; otherwise both must produce the same bytes.
    """
    for sample in self.STRINGS:
        with self.subTest(text=sample):
            try:
                wanted = sample.encode(self.ENCODING, errors)
            except UnicodeEncodeError:
                with self.assertRaises(RuntimeError) as caught:
                    self.encode(sample, errors)
                message = str(caught.exception)
                self.assertRegex(message, r"encode error: pos=[0-9]+, reason=")
            else:
                actual = self.encode(sample, errors)
                self.assertEqual(actual, wanted)
3285
3286 def test_encode_strict(self):
3287 self.check_encode_strings("strict")
3288
3289 def test_encode_surrogateescape(self):
3290 self.check_encode_strings("surrogateescape")
3291
3292 def test_encode_surrogatepass(self):
3293 try:
3294 self.encode('', 'surrogatepass')
3295 except ValueError as exc:
3296 if str(exc) == 'unsupported error handler':
3297 self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
3298 f"surrogatepass error handler")
3299 else:
3300 raise
3301
3302 self.check_encode_strings("surrogatepass")
3303
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003304 def test_encode_unsupported_error_handler(self):
3305 with self.assertRaises(ValueError) as cm:
3306 self.encode('', 'backslashreplace')
3307 self.assertEqual(str(cm.exception), 'unsupported error handler')
3308
Victor Stinner3d4226a2018-08-29 22:21:32 +02003309 def decode(self, encoded, errors="strict"):
3310 return _testcapi.DecodeLocaleEx(encoded, 0, errors)
3311
3312 def check_decode_strings(self, errors):
3313 is_utf8 = (self.ENCODING == "utf-8")
3314 if is_utf8:
3315 encode_errors = 'surrogateescape'
3316 else:
3317 encode_errors = 'strict'
3318
3319 strings = list(self.BYTES_STRINGS)
3320 for text in self.STRINGS:
3321 try:
3322 encoded = text.encode(self.ENCODING, encode_errors)
3323 if encoded not in strings:
3324 strings.append(encoded)
3325 except UnicodeEncodeError:
3326 encoded = None
3327
3328 if is_utf8:
3329 encoded2 = text.encode(self.ENCODING, 'surrogatepass')
3330 if encoded2 != encoded:
3331 strings.append(encoded2)
3332
3333 for encoded in strings:
3334 with self.subTest(encoded=encoded):
3335 try:
3336 expected = encoded.decode(self.ENCODING, errors)
3337 except UnicodeDecodeError:
3338 with self.assertRaises(RuntimeError) as cm:
3339 self.decode(encoded, errors)
3340 errmsg = str(cm.exception)
3341 self.assertTrue(errmsg.startswith("decode error: "), errmsg)
3342 else:
3343 decoded = self.decode(encoded, errors)
3344 self.assertEqual(decoded, expected)
3345
3346 def test_decode_strict(self):
3347 self.check_decode_strings("strict")
3348
3349 def test_decode_surrogateescape(self):
3350 self.check_decode_strings("surrogateescape")
3351
3352 def test_decode_surrogatepass(self):
3353 try:
3354 self.decode(b'', 'surrogatepass')
3355 except ValueError as exc:
3356 if str(exc) == 'unsupported error handler':
3357 self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
3358 f"surrogatepass error handler")
3359 else:
3360 raise
3361
3362 self.check_decode_strings("surrogatepass")
3363
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003364 def test_decode_unsupported_error_handler(self):
3365 with self.assertRaises(ValueError) as cm:
3366 self.decode(b'', 'backslashreplace')
3367 self.assertEqual(str(cm.exception), 'unsupported error handler')
3368
Victor Stinner3d4226a2018-08-29 22:21:32 +02003369
class Rot13Test(unittest.TestCase):
    """Exercise the educational ROT-13 codec through the codecs API."""

    def test_encode(self):
        # One-shot encoding via codecs.encode().
        self.assertEqual(
            codecs.encode("Caesar liked ciphers", 'rot-13'),
            'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        # One-shot decoding via codecs.decode().
        self.assertEqual(
            codecs.decode('Rg gh, Oehgr?', 'rot-13'),
            'Et tu, Brute?')

    def test_incremental_encode(self):
        # Look up the incremental encoder class, then instantiate it.
        encoder_factory = codecs.getincrementalencoder('rot-13')
        rot13_encoder = encoder_factory()
        self.assertEqual(
            rot13_encoder.encode('ABBA nag Cheryl Baker'),
            'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        # Look up the incremental decoder class, then instantiate it.
        decoder_factory = codecs.getincrementaldecoder('rot-13')
        rot13_decoder = decoder_factory()
        self.assertEqual(
            rot13_decoder.decode('terra Ares envy tha'),
            'green Nerf rail gun')
3389
3390
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        # Import the submodule explicitly.  "import encodings" alone does
        # not bind encodings.rot_13; the attribute only exists once some
        # other code has imported the submodule (for example a previous
        # 'rot-13' codec lookup), so relying on it makes this test fail
        # with AttributeError when it is run on its own.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        # rot13() reads from infile and writes the transformed text to
        # outfile.
        rot13(infile, outfile)
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3405
3406
if __name__ == "__main__":
    # Executed directly as a script: hand control to the unittest runner.
    unittest.main()