blob: 3aec34c7f167d649589016022b040b0ebdfcd5e8 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# _testcapi may be unavailable; fall back to None so users can test for it.
try:
    import _testcapi
except ImportError:
    _testcapi = None

# Without ctypes the size of wchar_t cannot be queried; -1 marks "unknown".
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    *self* supplies ``assertEqual``; *coder* is called with the input.  The
    returned helper asserts that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. the expected output together with a
    consumed length covering the whole input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server not
# all codepages are present
def is_code_page_present(cp):
    """Query kernel32's GetCPInfoExW for code page *cp*.

    Windows only (it relies on WINFUNCTYPE/WinDLL and ctypes.wintypes).
    Returns the BOOL result of the API call, which is truthy when the call
    succeeded.
    """
    # Structure is imported here so the function does not depend on the
    # module-level ``ctypes`` name, which is None when ctypes is missing.
    from ctypes import POINTER, WINFUNCTYPE, WinDLL, Structure
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # The structure instance is passed for the POINTER(CPINFOEXW) parameter.
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020050
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; it only needs to support ``+=``
        # and slicing (the tests pass ``b""``).
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
70
Victor Stinnerf96418d2015-09-21 23:06:27 +020071
class MixInCheckStateHandling:
    """getstate()/setstate() round-trip checks for incremental codecs.

    Meant to be mixed into a class that provides the unittest assertion
    methods (assertEqual, assertFalse, assertIsInstance).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* split at every offset, handing the state to a new decoder.

        For each split point the first part is decoded, the decoder state is
        transferred to a fresh decoder which decodes the rest, and the
        concatenated result must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input must not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* split at every offset, handing the state to a new encoder.

        The two encoded parts concatenated must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
104
Victor Stinnerf96418d2015-09-21 23:06:27 +0200105
class ReadTest(MixInCheckStateHandling):
    """Stream-reader and incremental-decoder checks shared by codec tests.

    Concrete test classes combine this with unittest.TestCase and provide
    the ``encoding`` and ``ill_formed_sequence`` class attributes that the
    methods below read from ``self``.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and compare partial output.

        After each byte the accumulated decoded text must equal the
        corresponding entry of *partialresults*.  The check is done with a
        StreamReader, with an incremental decoder, again after reset(), and
        finally with codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # readline() must split on \n, \r\n, \r and \u2028, with and without
        # keepends and with and without a size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line and join them with "|" so that line boundaries
            # are visible in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline(), read(), read(n) and readlines() on one reader
        # must not lose or duplicate data.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a reader must yield the \r\n-terminated lines of
        # this sample document one by one, unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is filled step by step: a trailing \r
        # is returned as a line end right away, and a \n arriving afterwards
        # is then seen as a separate (empty) line.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three \r\n-terminated lines must come back unchanged, followed by
        # an empty string at end of input.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of the ill-formed sequence when decoding with
    # the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate fails in strict mode; each error handler
        # must produce its documented substitute.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding the empty string yields just the BOM (if the codec writes
        # one); it is stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +0200444
Victor Stinnerf96418d2015-09-21 23:06:27 +0200445
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"
    # U+DC80 (a lone surrogate) as one UTF-32 code unit in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final code unit is replaced or dropped, depending on
        # the error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
Victor Stinnerf96418d2015-09-21 23:06:27 +0200541
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"
    # U+DC80 (a lone surrogate) as one little-endian UTF-32 code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
586
Victor Stinnerf96418d2015-09-21 23:06:27 +0200587
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"
    # U+DC80 (a lone surrogate) as one big-endian UTF-32 code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
632
633
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"
    # The code unit 0xDC80 (a lone low surrogate) in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two successive writes through one stream writer must emit a
        # single BOM, and the result must read back unchanged.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # b"\xff\xff" is neither byte-order mark, so reading must fail,
        # both for a bare bad BOM and for one followed by more data.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); going by
        # the inline notes, the list holds the text expected after each
        # successive encoded byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single stray byte with final=True: 'replace' yields U+FFFD,
        # 'ignore' drops it; both report one byte consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # The same kind of stray byte must raise under 'strict'.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # check_state_handling_decode() is inherited (not shown here);
        # it is exercised with both byte orders of the same text.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

    def test_invalid_modes(self):
        # codecs.open() must reject universal-newline and explicit text
        # modes with a ValueError carrying the expected message.
        for mode in ('U', 'rU', 'r+U'):
            with self.subTest(mode=mode):
                with self.assertRaises(ValueError) as cm:
                    codecs.open(support.TESTFN, mode, encoding=self.encoding)
                self.assertIn('invalid mode', str(cm.exception))

        for mode in ('rt', 'wt', 'at', 'r+t'):
            with self.subTest(mode=mode):
                with self.assertRaises(ValueError) as cm:
                    codecs.open(support.TESTFN, mode, encoding=self.encoding)
                self.assertIn("can't have text and binary mode at once",
                              str(cm.exception))
730
731
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less little-endian "utf-16-le" codec."""

    encoding = "utf-16-le"
    # The code unit 0xDC80 (a lone low surrogate), little-endian.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); the list
        # holds one expected result per step of the incremental decode.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected 'replace' output):
        # truncated code units and unpaired surrogates.  Every input must
        # also raise under 'strict'.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports every failing input rather than only the first.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the surrogate pair D800 DE03.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
775
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less big-endian "utf-16-be" codec."""

    encoding = "utf-16-be"
    # The code unit 0xDC80 (a lone low surrogate), big-endian.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); the list
        # holds one expected result per step of the incremental decode.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected 'replace' output):
        # truncated code units and unpaired surrogates.  Every input must
        # also raise under 'strict'.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports every failing input rather than only the first.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the surrogate pair D800 DE03.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
819
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"
    # U+DC80 (a lone surrogate) written out in UTF-8 form.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoded output; empty for this codec.
    BOM = b''

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); the list
        # holds one expected result per step of the incremental decode.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text spanning 1- to 4-byte sequences, including the
        # boundary code points of each length.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Two invalid bytes between brackets, decoded with each of the
        # standard non-strict error handlers.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # U+DC80 is handled by surrogateescape; the two surrogates after
        # it are not, and must be reported together as the failing slice.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates pass through as 3-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same byte sequences come back as surrogates.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or otherwise broken sequences must still raise.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        # Every valid lead/prefix followed by a byte that cannot continue it.
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                # A fresh decoder per case so no state leaks between inputs.
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
912
Victor Stinnerf96418d2015-09-21 23:06:27 +0200913
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); the list
        # holds one expected result per step of the incremental decode.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected 'replace' output).
        # Every input must also raise under 'strict'.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # U+104A0, given either as one code point or as its surrogate
        # pair, encodes to the same bytes; decoding accepts the shifted
        # sequence with or without the closing '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Each entry is (input, expected 'replace' output) for shifted
        # sequences that contain an unpaired or cut-off surrogate.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001046
1047
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        # A single byte with final=True cannot form a code unit, so
        # strict decoding must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling with no arguments at all is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1055
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # A buffer-exporting object is returned as bytes together with
        # the number of bytes read.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str is accepted and yields empty bytes, length 0.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument both raise TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1071
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for "utf-8-sig"; also inherits every UTF8Test test."""

    encoding = "utf-8-sig"
    # Encoded output is expected to start with the UTF-8 BOM.
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # check_partial() is inherited (not shown in this block); the list
        # holds one expected result per step of the incremental decode.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM the encoder added.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a "utf-8-sig" stream reader, once in a
        # single read() and once per chunk size, and check that the
        # reassembled text equals unistring every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty read marks the end of the stream.
                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not reach the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1156
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # Empty bytes and empty bytearray both decode to (b"", 0).
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            b = bytes([value])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # coding_checker is a module-level helper defined outside this
        # class; check(input, expect) asserts the decoded value and that
        # the whole input was consumed.
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognised escapes are kept verbatim but must warn.  Lowercase
        # letters are skipped when they are real escapes; the uppercase
        # form is checked for every letter.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Incomplete \x escapes: ValueError by default, dropped under
        # "ignore", replaced by "?" under "replace".  The reported length
        # always covers the whole input.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001216
Victor Stinnerf96418d2015-09-21 23:06:27 +02001217
Martin v. Löwis2548c732003-04-18 10:39:54 +00001218# From RFC 3492
1219punycode_testcases = [
1220 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001221 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1222 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001223 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001224 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001225 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001226 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001227 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001228 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001229 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001230 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001231 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1232 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1233 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001234 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001235 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001236 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1237 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1238 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001239 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001240 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001241 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001242 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1243 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1244 "\u0939\u0948\u0902",
1245 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001246
1247 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001248 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001249 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1250 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251
1252 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001253 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1254 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1255 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001256 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1257 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001258
1259 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001260 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1261 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1262 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1263 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001264 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265
1266 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001267 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1268 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1269 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1270 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1271 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001272 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273
1274 # (K) Vietnamese:
1275 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1276 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001277 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1278 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1279 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1280 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001281 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001282
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001284 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001285 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001286
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001288 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1289 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1290 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001291 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292
1293 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001294 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1295 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1296 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001297 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298
1299 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001300 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001301 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302
1303 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001304 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1305 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001306 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307
1308 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001309 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001310 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311
1312 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001313 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001314 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315
1316 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001317 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1318 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001319 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 ]
1321
# Import-time sanity check of the table above: every entry has to be a
# (unicode, punycode) pair.  Anything with a different length is printed so
# that a misplaced comma or parenthesis in the table is easy to spot.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325
Victor Stinnerf96418d2015-09-21 23:06:27 +02001326
Martin v. Löwis2548c732003-04-18 10:39:54 +00001327class PunycodeTest(unittest.TestCase):
1328 def test_encode(self):
1329 for uni, puny in punycode_testcases:
1330 # Need to convert both strings to lower case, since
1331 # some of the extended encodings use upper case, but our
1332 # code produces only lower case. Converting just puny to
1333 # lower is also insufficient, since some of the input characters
1334 # are upper case.
Ezio Melottib3aedd42010-11-20 19:04:17 +00001335 self.assertEqual(
Walter Dörwalda4c61282007-05-10 12:36:25 +00001336 str(uni.encode("punycode"), "ascii").lower(),
1337 str(puny, "ascii").lower()
1338 )
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339
1340 def test_decode(self):
1341 for uni, puny in punycode_testcases:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001342 self.assertEqual(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +00001343 puny = puny.decode("ascii").encode("ascii")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001344 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001345
Victor Stinnerf96418d2015-09-21 23:06:27 +02001346
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings:
#   * (input, bytes)  - nameprep(input) must equal the expected text;
#   * (input, None)   - the input contains prohibited characters;
#   * (None, None)    - placeholder for a skipped vector, so that the 1-based
#                       position in the list stays equal to the "3.<n>" number
#                       used in the comments below.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the title is kept as labelled, but the input bytes
    # \xef\xb9\xb6 are the UTF-8 encoding of U+FE76, not U+FB38.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    # NOTE(review): that is the same sequence as used below; presumably the
    # source document has \xc3\xdf here as in 3.3 and 3.44 - TODO confirm.
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1499
1500
1501class NameprepTest(unittest.TestCase):
1502 def test_nameprep(self):
1503 from encodings.idna import nameprep
1504 for pos, (orig, prepped) in enumerate(nameprep_tests):
1505 if orig is None:
1506 # Skipped
1507 continue
1508 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001509 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001510 if prepped is None:
1511 # Input contains prohibited characters
1512 self.assertRaises(UnicodeError, nameprep, orig)
1513 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001514 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001515 try:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001516 self.assertEqual(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001517 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001518 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001519
Victor Stinnerf96418d2015-09-21 23:06:27 +02001520
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec through the one-shot, stream and
    incremental interfaces.

    The one-shot and incremental tests use the same four names: a plain
    ASCII name and an ACE ("xn--") name, each with and without a trailing
    dot.
    """

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been consumed, a further read() must
        # return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the decoder one byte at a time through iterdecode().
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                single_bytes = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(single_bytes, "idna")),
                    expected
                )

        # Drive the decoder by hand: a label is only emitted once the dot
        # that ends it has been seen, or on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str hands the encoder one character at a time.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # Drive the encoder by hand: the last label is held back until the
        # final call because more characters could still follow.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot every label is complete before the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                b"python.org".decode, "idna", errors)
1606
Victor Stinnerf96418d2015-09-21 23:06:27 +02001607
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001608class CodecsModuleTest(unittest.TestCase):
1609
1610 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001611 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1612 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001613 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001614 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001615 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001616
Victor Stinnera57dfd02014-05-14 17:13:14 +02001617 # test keywords
1618 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1619 '\xe4\xf6\xfc')
1620 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1621 '[]')
1622
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001623 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001624 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1625 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001626 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001627 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001628 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001629 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001630
Victor Stinnera57dfd02014-05-14 17:13:14 +02001631 # test keywords
1632 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1633 b'\xe4\xf6\xfc')
1634 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1635 b'[]')
1636
Walter Dörwald063e1e82004-10-28 13:04:26 +00001637 def test_register(self):
1638 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001639 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001640
1641 def test_lookup(self):
1642 self.assertRaises(TypeError, codecs.lookup)
1643 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001644 self.assertRaises(LookupError, codecs.lookup, " ")
1645
1646 def test_getencoder(self):
1647 self.assertRaises(TypeError, codecs.getencoder)
1648 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1649
1650 def test_getdecoder(self):
1651 self.assertRaises(TypeError, codecs.getdecoder)
1652 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1653
1654 def test_getreader(self):
1655 self.assertRaises(TypeError, codecs.getreader)
1656 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1657
1658 def test_getwriter(self):
1659 self.assertRaises(TypeError, codecs.getwriter)
1660 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001661
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001662 def test_lookup_issue1813(self):
1663 # Issue #1813: under Turkish locales, lookup of some codecs failed
1664 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001665 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001666 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1667 try:
1668 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1669 except locale.Error:
1670 # Unsupported locale on this system
1671 self.skipTest('test needs Turkish locale')
1672 c = codecs.lookup('ASCII')
1673 self.assertEqual(c.name, 'ascii')
1674
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001675 def test_all(self):
1676 api = (
1677 "encode", "decode",
1678 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1679 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1680 "getencoder", "getdecoder", "getincrementalencoder",
1681 "getincrementaldecoder", "getreader", "getwriter",
1682 "register_error", "lookup_error",
1683 "strict_errors", "replace_errors", "ignore_errors",
1684 "xmlcharrefreplace_errors", "backslashreplace_errors",
1685 "namereplace_errors",
1686 "open", "EncodedFile",
1687 "iterencode", "iterdecode",
1688 "BOM", "BOM_BE", "BOM_LE",
1689 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1690 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1691 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1692 "StreamReaderWriter", "StreamRecoder",
1693 )
1694 self.assertCountEqual(api, codecs.__all__)
1695 for api in codecs.__all__:
1696 getattr(codecs, api)
1697
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001698 def test_open(self):
1699 self.addCleanup(support.unlink, support.TESTFN)
1700 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1701 with self.subTest(mode), \
1702 codecs.open(support.TESTFN, mode, 'ascii') as file:
1703 self.assertIsInstance(file, codecs.StreamReaderWriter)
1704
1705 def test_undefined(self):
1706 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1707 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1708 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1709 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1710 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1711 self.assertRaises(UnicodeError,
1712 codecs.encode, 'abc', 'undefined', errors)
1713 self.assertRaises(UnicodeError,
1714 codecs.decode, b'abc', 'undefined', errors)
1715
Victor Stinnerf96418d2015-09-21 23:06:27 +02001716
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapped around a bytes stream."""

    def setUp(self):
        # Two lines of UTF-8 encoded text; only the first ends in a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001726
Victor Stinnerf96418d2015-09-21 23:06:27 +02001727
class EncodedFileTest(unittest.TestCase):
    """Recoding through codecs.EncodedFile, reading and writing."""

    def test_basic(self):
        # Reading: UTF-8 bytes stored in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data handed to write() lands in the file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001739
# Codec names used by the generic round-trip tests, listed alphabetically
# and grouped by family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are added, in that order, only when the codecs module
# provides the matching mbcs_encode / oem_encode function.
all_unicode_encodings.extend(
    optional for optional in ("mbcs", "oem")
    if hasattr(codecs, optional + "_encode")
)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001848
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001849# The following encoding is not tested, because it's not supposed
1850# to work:
1851# "undefined"
1852
# Encodings that do not work in stateful mode; the stream and incremental
# checks leave them out.
broken_unicode_with_stateful = ["punycode"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001857
Victor Stinnerf96418d2015-09-21 23:06:27 +02001858
Walter Dörwald3abcb012007-04-16 22:10:50 +00001859class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        # Round-trip a short ASCII string through every codec named in
        # all_unicode_encodings, using in turn: the one-shot encoder and
        # decoder, the stream writer/reader, the incremental encoder/decoder,
        # iterencode()/iterdecode(), and finally the incremental classes
        # created with errors="ignore".
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry, with
            # "_" and "-" treated as the same character.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                # The list entry is expected as-is for latin_1, whatever
                # name lookup() reports for it.
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # One-shot encoder/decoder: both return (result, length consumed).
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                # Queue is a helper defined elsewhere in this file; it is
                # used here as the byte stream behind the writer and reader
                # (presumably a FIFO buffer - confirm against its definition).
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and collect what came out.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded form back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True lets the encoder emit anything it held back
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # "idna" and "mbcs" are left out of the errors="ignore" run.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        # Unlike above, no final (final=True) call is made.
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001936
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_basics_capi(self):
    """Round-trip ASCII text through incremental codecs fetched via the C API.

    The test is skipped when the optional ``_testcapi`` extension could not
    be imported (the module-level import falls back to ``None``), instead of
    failing with an AttributeError on the first helper call.
    """
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API: encode one character at a time, then flush
            encodedresult = b""
            for c in s:
                encodedresult += cencoder.encode(c)
            encodedresult += cencoder.encode("", True)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding)
            # decode one byte at a time, then flush
            decodedresult = ""
            for c in encodedresult:
                decodedresult += cdecoder.decode(bytes([c]))
            decodedresult += cdecoder.decode(b"", True)
            self.assertEqual(decodedresult, s,
                             "encoding=%r" % encoding)

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(cencoder.encode(c) for c in s)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decodedresult = "".join(cdecoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001974
def test_seek(self):
    """Check that seek(0) on a stream reader lets the data be re-read in full."""
    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # "idna" is excluded here -- FIXME: See SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        stream = io.BytesIO(s.encode(encoding))
        reader = codecs.getreader(encoding)(stream)
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            self.assertEqual(s, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00001989
def test_bad_decode_args(self):
    """Every decoder must raise TypeError when called with bad arguments."""
    for encoding in all_unicode_encodings:
        decoder = codecs.getdecoder(encoding)
        # Calling without any argument is always a TypeError.
        with self.assertRaises(TypeError):
            decoder()
        # The integer-argument check is not applied to these two codecs.
        if encoding in ("idna", "punycode"):
            continue
        with self.assertRaises(TypeError):
            decoder(42)
1996
def test_bad_encode_args(self):
    """Every encoder must raise TypeError when called without arguments."""
    for encoding in all_unicode_encodings:
        encoder = codecs.getencoder(encoding)
        with self.assertRaises(TypeError):
            encoder()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002001
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002002 def test_encoding_map_type_initialized(self):
2003 from encodings import cp1140
2004 # This used to crash, we are only verifying there's no crash.
2005 table_type = type(cp1140.encoding_table)
2006 self.assertEqual(table_type, table_type)
2007
def test_decoder_state(self):
    """Run the getstate()/setstate() handling checks for every usable codec."""
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        encoded = u.encode(encoding)
        self.check_state_handling_decode(encoding, u, encoded)
        self.check_state_handling_encode(encoding, u, encoded)
2015
Victor Stinnerf96418d2015-09-21 23:06:27 +02002016
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with str, int->str and int->int maps."""

    # Three input bytes; several tables below leave byte 2 unmapped or map it
    # to an "undefined" marker (U+FFFE / None) to trigger the error handler.
    _INPUT = b"\x00\x01\x02"

    def _assert_decodes(self, cases):
        """Check each (errors, table, expected_text) case against _INPUT."""
        for errors, table, text in cases:
            self.assertEqual(
                codecs.charmap_decode(self._INPUT, errors, table),
                (text, 3),
            )

    def _assert_rejected(self, exc_type, tables):
        """Check that strict decoding of _INPUT raises exc_type for each table."""
        for table in tables:
            with self.assertRaises(exc_type):
                codecs.charmap_decode(self._INPUT, "strict", table)

    def test_decode_with_string_map(self):
        self._assert_decodes([
            ("strict", "abc", "abc"),
            ("strict", "\U0010FFFFbc", "\U0010FFFFbc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("backslashreplace", "ab", "ab\\x02"),
            ("backslashreplace", "ab\ufffe", "ab\\x02"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._assert_rejected(UnicodeDecodeError, ["ab", "ab\ufffe"])

        # With an empty map and "ignore", every byte is consumed and dropped.
        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte)),
        )

    def test_decode_with_int2str_map(self):
        missing = {0: 'a', 1: 'b'}
        mapped_to_none = {0: 'a', 1: 'b', 2: None}
        mapped_to_fffe = {0: 'a', 1: 'b', 2: '\ufffe'}  # Issue #14850
        undefined = (missing, mapped_to_none, mapped_to_fffe)

        self._assert_decodes([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
        ])
        self._assert_rejected(UnicodeDecodeError, undefined)

        # Each error handler treats the three "undefined" tables alike.
        for errors, text in (("replace", "ab\ufffd"),
                             ("backslashreplace", "ab\\x02"),
                             ("ignore", "ab")):
            self._assert_decodes(
                [(errors, table, text) for table in undefined])

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", {}),
            ("", len(every_byte)),
        )

    def test_decode_with_int2int_map(self):
        a, b, c = (ord(ch) for ch in 'abc')
        missing = {0: a, 1: b}
        mapped_to_fffe = {0: a, 1: b, 2: 0xFFFE}
        undefined = (missing, mapped_to_fffe)

        self._assert_decodes([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             chr(sys.maxunicode) + "bc"),
        ])

        # A code point beyond sys.maxunicode is a TypeError, not a decode error.
        self._assert_rejected(TypeError,
                              [{0: sys.maxunicode + 1, 1: b, 2: c}])
        self._assert_rejected(UnicodeDecodeError, undefined)

        for errors, text in (("replace", "ab\ufffd"),
                             ("backslashreplace", "ab\\x02"),
                             ("ignore", "ab")):
            self._assert_decodes(
                [(errors, table, text) for table in undefined])
2251
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002252
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers.

    Both tests read through the wrapper inside a ``with`` block and then
    verify that leaving the block closed the wrapped byte stream.
    """

    def test_encodedfile(self):
        # The stream holds UTF-8 bytes; reading through the wrapper yields
        # the same character as a Latin-1 byte.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same post-condition as test_encodedfile: exiting the context
        # manager must close the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002266
Victor Stinnerf96418d2015-09-21 23:06:27 +02002267
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        )
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        # mbcs_decode is only added when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for source in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(source), ("\u1234", 6))

        # \U00110000 is one past the highest valid code point.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002307
Serhiy Storchakad6793772013-01-29 10:20:44 +02002308
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        # Empty input maps to empty output with a consumed length of 0.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is emitted unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point with
        # the same value.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 0x7f-0xff use \xNN escapes.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised escapes are kept verbatim but must emit a
        # DeprecationWarning; recognised escape letters are excluded.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires);
        # \U takes 8 digits, so every truncation from 0 to 7 digits is tried.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets, one at end of input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape whose value is out of range is an error as well.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2395
2396
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / raw_unicode_escape_decode()."""

    def test_empty(self):
        # Empty input maps to empty output with a consumed length of 0.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is emitted as the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but u/U is passed through as is.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # Only \u and \U are escapes for this codec.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires);
        # \U takes 8 digits, so every truncation from 0 to 7 digits is tried.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets, one at end of input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape whose value is out of range is an error as well.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2445
2446
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode()."""

    def test_escape_encode(self):
        # Maps each input to the expected (escaped bytes, length) result.
        expected_by_input = {
            b'': (b'', 0),
            b'foobar': (b'foobar', 6),
            b'spam\0eggs': (b'spam\\x00eggs', 9),
            b'a\'b': (b"a\\'b", 3),
            b'b\\c': (b'b\\\\c', 3),
            b'c\nd': (b'c\\nd', 3),
            b'd\re': (b'd\\re', 3),
            b'f\x7fg': (b'f\\x7fg', 3),
        }
        for data, output in expected_by_input.items():
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)

        # Only bytes is accepted: str and bytearray are both rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2465
2466
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler across several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        """Assert raw decodes to text and text encodes back to raw."""
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373 -- only the encoding direction is checked here.
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2499
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002500
class BomTest(unittest.TestCase):
    """Check BOM handling of the UTF-16/UTF-32 codecs around seek() calls."""

    def test_seek0(self):
        text = "1234567890"
        codec_names = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        for encoding in codec_names:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as fh:
                fh.write(text)
                fh.write(text)
                # Reading twice from the start must give the same result.
                for _ in range(2):
                    fh.seek(0)
                    self.assertEqual(fh.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as fh:
                fh.write(text[0])
                self.assertNotEqual(fh.tell(), 0)
                fh.seek(0)
                fh.write(text)
                fh.seek(0)
                self.assertEqual(fh.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as fh:
                writer = fh.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                fh.seek(0)
                self.assertEqual(fh.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as fh:
                fh.write(text)
                fh.seek(fh.tell())
                fh.write(text)
                fh.seek(0)
                self.assertEqual(fh.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as fh:
                writer = fh.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                fh.seek(0)
                self.assertEqual(fh.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002556
Victor Stinner3fed0872010-05-22 02:16:27 +00002557
# Bytes-to-bytes transform codecs that need no optional module.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alias names, keyed by canonical codec name.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; the name is bound to None when it cannot be imported.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional too, but no fallback name is bound when it is missing.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002587
Victor Stinnerf96418d2015-09-21 23:06:27 +02002588
Georg Brandl02524622010-12-02 18:06:51 +00002589class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002590
def test_basics(self):
    """Round-trip all 256 byte values through each bytes transform codec."""
    binput = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002601
def test_read(self):
    """read() on a transform-codec stream reader returns the decoded bytes."""
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002609
def test_readline(self):
    """readline() on a transform-codec stream reader returns the decoded bytes."""
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.readline(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002617
def test_buffer_api_usage(self):
    """Transform codecs must treat memoryview input exactly like bytes."""
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # Encoding: bytes and memoryview inputs must agree.
            encoded = codecs.encode(original, encoding)
            encoded_from_view = codecs.encode(memoryview(original), encoding)
            self.assertEqual(encoded_from_view, encoded)

            # Decoding: must round-trip, and again agree for both inputs.
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(memoryview(encoded), encoding)
            self.assertEqual(decoded_from_view, decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002635
def test_text_to_binary_blacklists_binary_transforms(self):
    """str.encode() must refuse binary transform codecs with a clear error."""
    # Check binary -> binary codecs give a good error for str input
    bad_input = "bad input type"
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            pattern = template.format(encoding)
            with self.assertRaisesRegex(LookupError, pattern) as failure:
                bad_input.encode(encoding)
            # The LookupError must not be chained to another exception.
            self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002647
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002648 def test_text_to_binary_blacklists_text_transforms(self):
2649 # Check str.encode gives a good error message for str -> str codecs
2650 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002651 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002652 with self.assertRaisesRegex(LookupError, msg):
2653 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002654
def test_binary_to_text_blacklists_binary_transforms(self):
    """Check bytes.decode() and bytearray.decode() reject binary codecs.

    Both must raise a LookupError with a helpful message when asked to use
    a binary -> binary codec.
    """
    plain = b"encode first to ensure we meet any format restrictions"
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(plain, encoding)
            expected = template.format(encoding)
            # Same check for the immutable and the mutable binary type.
            for candidate in (encoded, bytearray(encoded)):
                with self.assertRaisesRegex(LookupError, expected):
                    candidate.decode(encoding)
2669
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002670 def test_binary_to_text_blacklists_text_transforms(self):
2671 # Check str -> str codec gives a good error for binary input
2672 for bad_input in (b"immutable", bytearray(b"mutable")):
2673 with self.subTest(bad_input=bad_input):
2674 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002675 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002676 with self.assertRaisesRegex(LookupError, msg) as failure:
2677 bad_input.decode("rot_13")
2678 self.assertIsNone(failure.exception.__cause__)
2679
Zachary Wareefa2e042013-12-30 14:54:11 -06002680 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002681 def test_custom_zlib_error_is_wrapped(self):
2682 # Check zlib codec gives a good error for malformed input
2683 msg = "^decoding with 'zlib_codec' codec failed"
2684 with self.assertRaisesRegex(Exception, msg) as failure:
2685 codecs.decode(b"hello", "zlib_codec")
2686 self.assertIsInstance(failure.exception.__cause__,
2687 type(failure.exception))
2688
2689 def test_custom_hex_error_is_wrapped(self):
2690 # Check hex codec gives a good error for malformed input
2691 msg = "^decoding with 'hex_codec' codec failed"
2692 with self.assertRaisesRegex(Exception, msg) as failure:
2693 codecs.decode(b"hello", "hex_codec")
2694 self.assertIsInstance(failure.exception.__cause__,
2695 type(failure.exception))
2696
2697 # Unfortunately, the bz2 module throws OSError, which the codec
2698 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002699
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002700 # Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    """Check every alias resolves to the same codec as its canonical name."""
    for codec_name, aliases in transform_aliases.items():
        canonical = codecs.lookup(codec_name).name
        for alias in aliases:
            with self.subTest(alias=alias):
                self.assertEqual(codecs.lookup(alias).name, canonical)
2708
Martin Panter06171bd2015-09-12 00:34:28 +00002709 def test_quopri_stateless(self):
2710 # Should encode with quotetabs=True
2711 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2712 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2713 # But should still support unescaped tabs and spaces
2714 unescaped = b"space tab eol\n"
2715 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2716
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002717 def test_uu_invalid(self):
2718 # Missing "begin" line
2719 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2720
Nick Coghlan8b097b42013-11-13 23:49:21 +10002721
2722# The codec system tries to wrap exceptions in order to ensure the error
2723# mentions the operation being performed and the codec involved. We
2724# currently *only* want this to happen for relatively stateless
2725# exceptions, where the only significant information they contain is their
2726# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002727
# Keep the test codecs in a local registry served by one search function,
# to avoid appearing to leak objects when registering multiple search
# functions.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Return the test codec registered under *codec_name*, or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)
2735
try:
    # Issue #22166: the internal codec cache in CPython needs clearing too.
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Do nothing: used when _codecs provides no _forget_codec()."""
        return None
2742
2743
class ExceptionChainingTest(unittest.TestCase):
    """Check when exceptions raised by a codec are wrapped and when not."""

    def setUp(self):
        # A codec search function can't be unregistered, so each test uses
        # a codec name of its own, built from the test case repr, which
        # keeps the entry fairly harmless once the test has finished.
        # The name is normalized because the codecs module normalizes codec
        # names (this doesn't appear to be formally documented).
        # The object id is appended to get a truly unique name, avoiding
        # issues with the codec cache when the tests run multiple times
        # (e.g. when hunting for refleaks).
        raw_name = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # The object to raise is stored on the instance because codec
        # caching (the codec entry can't be recreated) interacts badly with
        # regrtest refleak hunting (the same test instance is run several
        # times). The codecs therefore call back into the instance to find
        # out what to raise, instead of binding it in a closure to an
        # object that may change on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: also pop from the caches to avoid the appearance
        # of reference leaks.
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode* and *decode* as this test's custom codec."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the managed code raises a wrapped *exc_type*.

        The wrapper's message must name *operation*, this test's codec,
        the exception type and *msg*; its __cause__ must be an *exc_type*
        instance that carries a traceback.
        """
        expected = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, expected) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Codec entry point that raises whatever the instance holds, so
        # the raised object can be changed dynamically.
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check *obj_to_raise* is wrapped on all four encode/decode paths."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        attempts = (
            ("encoding", lambda: "str_input".encode(self.codec_name)),
            ("encoding", lambda: codecs.encode("str_input", self.codec_name)),
            ("decoding", lambda: b"bytes input".decode(self.codec_name)),
            ("decoding",
             lambda: codecs.decode(b"bytes input", self.codec_name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass

        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check *obj_to_raise* propagates unwrapped on all four paths."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is registered here, so each call fails in the initial
        # codec lookup, and that failure should not be wrapped.
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        attempts = (
            lambda: "str input".encode(self.codec_name),
            lambda: codecs.encode("str input", self.codec_name),
            lambda: b"bytes input".decode(self.codec_name),
            lambda: codecs.decode(b"bytes input", self.codec_name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are marked so the text model methods
        # skip them pre-emptively. Third party codecs won't be flagged, so
        # the case where a codec produces an inappropriate output type
        # still has to be handled appropriately.
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)

        # The codecs module functions check neither input nor output types.
        self.assertEqual(codecs.encode(None, self.codec_name), "not bytes!")
        self.assertEqual(codecs.decode(None, self.codec_name), b"not str!")

        # The text model methods should complain.
        encode_error = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, encode_error):
            "str_input".encode(self.codec_name)

        decode_error = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, decode_error):
            b"bytes input".decode(self.codec_name)
2903
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002904
Georg Brandl02524622010-12-02 18:06:51 +00002905
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decode cases against code page *cp*.

        An expected value of None means decoding must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors,
                                                         True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # The consumed length must satisfy 0 <= consumed <= len(raw).
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encode cases against code page *cp*.

        An expected value of None means encoding must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        code_pages = (50220, 50221, 50222, 50225, 50227, 50229,
                      *range(57002, 57011+1), 65000)
        for cp in code_pages:
            # Small versions of Windows, like Windows IoT, lack some code
            # pages. A missing code page causes an OSError exception, so
            # check for the code page before decoding.
            if not is_code_page_present(cp):
                if support.verbose:
                    print(f" skipping cp={cp}")
                continue
            self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                             ('abc', 3), f'cp{cp}')
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # Each case decodes with final=False and lists the expected
        # (text, number of bytes consumed) result.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            self.assertEqual(codecs.lookup('cp123').name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        filler = b'01234567' * ((size//8)-1)
        encoded = filler + b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff'
        del filler
        self.assertEqual(len(encoded), size+2)
        text, consumed = codecs.code_page_decode(932, encoded,
                                                 'surrogateescape', True)
        self.assertEqual(consumed, len(encoded))
        # Release the large input before the remaining checks.
        del encoded
        self.assertEqual(len(text), consumed)
        self.assertEqual(text[:10], '0123456701')
        self.assertEqual(text[-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = b'0123456\xed\x84\x80' * (size//8)
        self.assertEqual(len(encoded), size // 8 * 10)
        text, consumed = codecs.code_page_decode(65001, encoded,
                                                 'ignore', True)
        self.assertEqual(consumed, len(encoded))
        # Release the large input before the remaining checks.
        del encoded
        self.assertEqual(len(text), size)
        self.assertEqual(text[:10], '0123456\ud10001')
        self.assertEqual(text[-11:], '56\ud1000123456\ud100')
3120
Victor Stinner3a50e702011-10-18 21:21:00 +02003121
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        # (bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3159
3160
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # (bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
3196
3197
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        buffer = io.BytesIO()
        ascii_codec = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            buffer, ascii_codec.encode, ascii_codec.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(buffer.getvalue(), b'ab')

    def test_write(self):
        buffer = io.BytesIO()
        latin1_codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            buffer, latin1_codec.encode, latin1_codec.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(buffer.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        source = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(source, 'utf-8', 'utf-16-le')

        self.assertEqual(recoded.readline(), b'line1\n')
        # After rewinding, the whole content must be readable again,
        # ending with an empty read.
        recoded.seek(0)
        for expected in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(recoded.readline(), expected)

    def test_seeking_write(self):
        source = io.BytesIO('123456789\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(source, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        recoded.seek(2)
        recoded.write(b'\nabc\n')
        self.assertEqual(recoded.readline(), b'789\n')
        recoded.seek(0)
        for expected in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(recoded.readline(), expected)
3242
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003243
Victor Stinner3d4226a2018-08-29 22:21:32 +02003244@unittest.skipIf(_testcapi is None, 'need _testcapi module')
3245class LocaleCodecTest(unittest.TestCase):
3246 """
3247 Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
3248 """
3249 ENCODING = sys.getfilesystemencoding()
3250 STRINGS = ("ascii", "ulatin1:\xa7\xe9",
3251 "u255:\xff",
3252 "UCS:\xe9\u20ac\U0010ffff",
3253 "surrogates:\uDC80\uDCFF")
3254 BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
3255 SURROGATES = "\uDC80\uDCFF"
3256
def encode(self, text, errors="strict"):
    """Encode *text* with _testcapi.EncodeLocaleEx() using *errors*."""
    # NOTE(review): the literal 0 is presumably the "current locale" flag;
    # confirm against the _testcapi implementation.
    encoded = _testcapi.EncodeLocaleEx(text, 0, errors)
    return encoded
3259
def check_encode_strings(self, errors):
    """Compare self.encode() with str.encode() for every sample string.

    When str.encode() rejects a sample, self.encode() must raise a
    RuntimeError whose message reports the error position and reason.
    """
    for text in self.STRINGS:
        with self.subTest(text=text):
            try:
                expected = text.encode(self.ENCODING, errors)
            except UnicodeEncodeError:
                with self.assertRaises(RuntimeError) as cm:
                    self.encode(text, errors)
                self.assertRegex(str(cm.exception),
                                 r"encode error: pos=[0-9]+, reason=")
                continue
            self.assertEqual(self.encode(text, errors), expected)
3273
3274 def test_encode_strict(self):
3275 self.check_encode_strings("strict")
3276
3277 def test_encode_surrogateescape(self):
3278 self.check_encode_strings("surrogateescape")
3279
3280 def test_encode_surrogatepass(self):
3281 try:
3282 self.encode('', 'surrogatepass')
3283 except ValueError as exc:
3284 if str(exc) == 'unsupported error handler':
3285 self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
3286 f"surrogatepass error handler")
3287 else:
3288 raise
3289
3290 self.check_encode_strings("surrogatepass")
3291
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003292 def test_encode_unsupported_error_handler(self):
3293 with self.assertRaises(ValueError) as cm:
3294 self.encode('', 'backslashreplace')
3295 self.assertEqual(str(cm.exception), 'unsupported error handler')
3296
Victor Stinner3d4226a2018-08-29 22:21:32 +02003297 def decode(self, encoded, errors="strict"):
3298 return _testcapi.DecodeLocaleEx(encoded, 0, errors)
3299
3300 def check_decode_strings(self, errors):
3301 is_utf8 = (self.ENCODING == "utf-8")
3302 if is_utf8:
3303 encode_errors = 'surrogateescape'
3304 else:
3305 encode_errors = 'strict'
3306
3307 strings = list(self.BYTES_STRINGS)
3308 for text in self.STRINGS:
3309 try:
3310 encoded = text.encode(self.ENCODING, encode_errors)
3311 if encoded not in strings:
3312 strings.append(encoded)
3313 except UnicodeEncodeError:
3314 encoded = None
3315
3316 if is_utf8:
3317 encoded2 = text.encode(self.ENCODING, 'surrogatepass')
3318 if encoded2 != encoded:
3319 strings.append(encoded2)
3320
3321 for encoded in strings:
3322 with self.subTest(encoded=encoded):
3323 try:
3324 expected = encoded.decode(self.ENCODING, errors)
3325 except UnicodeDecodeError:
3326 with self.assertRaises(RuntimeError) as cm:
3327 self.decode(encoded, errors)
3328 errmsg = str(cm.exception)
3329 self.assertTrue(errmsg.startswith("decode error: "), errmsg)
3330 else:
3331 decoded = self.decode(encoded, errors)
3332 self.assertEqual(decoded, expected)
3333
3334 def test_decode_strict(self):
3335 self.check_decode_strings("strict")
3336
3337 def test_decode_surrogateescape(self):
3338 self.check_decode_strings("surrogateescape")
3339
3340 def test_decode_surrogatepass(self):
3341 try:
3342 self.decode(b'', 'surrogatepass')
3343 except ValueError as exc:
3344 if str(exc) == 'unsupported error handler':
3345 self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
3346 f"surrogatepass error handler")
3347 else:
3348 raise
3349
3350 self.check_decode_strings("surrogatepass")
3351
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003352 def test_decode_unsupported_error_handler(self):
3353 with self.assertRaises(ValueError) as cm:
3354 self.decode(b'', 'backslashreplace')
3355 self.assertEqual(str(cm.exception), 'unsupported error handler')
3356
Victor Stinner3d4226a2018-08-29 22:21:32 +02003357
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        """codecs.encode() with 'rot-13' rotates a str in one shot."""
        ciphertext = codecs.encode("Caesar liked ciphers", 'rot-13')
        self.assertEqual(ciphertext, 'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        """codecs.decode() with 'rot-13' restores the plain text."""
        plaintext = codecs.decode('Rg gh, Oehgr?', 'rot-13')
        self.assertEqual(plaintext, 'Et tu, Brute?')

    def test_incremental_encode(self):
        """The incremental encoder returns the rotated text."""
        encoder = codecs.getincrementalencoder('rot-13')()
        ciphertext = encoder.encode('ABBA nag Cheryl Baker')
        self.assertEqual(ciphertext, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        """The incremental decoder returns the rotated text."""
        decoder = codecs.getincrementaldecoder('rot-13')()
        plaintext = decoder.decode('terra Ares envy tha')
        self.assertEqual(plaintext, 'green Nerf rail gun')
3377
3378
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """

    def test_rot13_func(self):
        """rot13() reads all of *infile* and writes the rotated text."""
        # Import the submodule explicitly: "import encodings" alone does not
        # guarantee that the encodings.rot_13 attribute exists, so relying
        # on it makes the test depend on an earlier rot-13 codec lookup.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3393
3394
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()