blob: b37525bf660430414f419fa9329c928255999f5c [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# _testcapi is an optional C helper module; tests that need it must check
# for None first.
try:
    import _testcapi
except ImportError:
    _testcapi = None

# ctypes is optional as well.  When it is missing, SIZEOF_WCHAR_T is set to
# -1 so that it never compares equal to a real wchar_t size.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to a test case.

    *self* is the object whose ``assertEqual`` is used to report failures
    and *coder* is a codec function that returns an ``(output, consumed)``
    pair.  The returned helper asserts that ``coder(input)`` produced
    *expect* and consumed the whole input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server,
# not all code pages are present.
def is_code_page_present(cp):
    """Return the result of ``GetCPInfoExW`` for Windows code page *cp*.

    The value is the BOOL returned by the Win32 call, so it is true when
    the system could supply information about the code page and false
    otherwise.  Windows only: the ctypes names imported below do not exist
    on other platforms.
    """
    # Everything is imported locally so that merely importing this module
    # works on platforms without WINFUNCTYPE/WinDLL.
    from ctypes import POINTER, WINFUNCTYPE, Structure, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # The structure is only an output buffer here; just the BOOL matters.
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020050
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type (e.g. b"" or "")
        # that later reads hand back.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* drains everything; otherwise at most *size*
        items are returned.
        """
        if size < 0:
            data = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return data
        data = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return data
70
Victor Stinnerf96418d2015-09-21 23:06:27 +020071
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for codecs.

    It must be combined with a class providing the unittest assertion
    methods (assertEqual, assertFalse, assertIsInstance).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* yields *u* when split at every offset.

        For each split point the first part is decoded, the decoder state
        is captured and transplanted into a brand new decoder, and that
        decoder finishes the second part.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* yields *s* when split at every offset.

        The encoder state captured after the first part is transplanted
        into a fresh encoder, which then encodes the rest.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
104
Victor Stinnerf96418d2015-09-21 23:06:27 +0200105
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    A concrete subclass must also derive from unittest.TestCase and define:

    * ``encoding``: the codec name under test;
    * ``ill_formed_sequence``: the bytes that "\\uDC80" encodes to with the
      "surrogatepass" handler (used by test_lone_surrogates).
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare the text
        decoded so far with the matching entry of *partialresults*.

        zip() stops at the shorter sequence, so *partialresults* should
        hold one entry per encoded byte.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded into the stream.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so data can be added
        # between readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Expected output of the "replace" handler for ill_formed_sequence.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever encoding the empty string produces (a BOM, if any) is
        # stripped from the encoded pieces and prepended exactly once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Victor Stinnerf96418d2015-09-21 23:06:27 +0200445
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"
    # "\uDC80" encoded with "surrogatepass" in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
Victor Stinnerf96418d2015-09-21 23:06:27 +0200541
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec."""

    encoding = "utf-32-le"
    # "\uDC80" encoded with "surrogatepass", little-endian.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected entry per encoded byte: 5 characters * 4 bytes.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
586
Victor Stinnerf96418d2015-09-21 23:06:27 +0200587
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec."""

    encoding = "utf-32-be"
    # "\uDC80" encoded with "surrogatepass", big-endian.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected entry per encoded byte: 5 characters * 4 bytes.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
632
633
Ezio Melotti5d3dba02013-01-11 06:02:07 +0200634class UTF16Test(ReadTest, unittest.TestCase):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000635 encoding = "utf-16"
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200636 if sys.byteorder == 'little':
637 ill_formed_sequence = b"\x80\xdc"
638 else:
639 ill_formed_sequence = b"\xdc\x80"
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000640
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000641 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
642 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000643
644 def test_only_one_bom(self):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000645 _,_,reader,writer = codecs.lookup(self.encoding)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000646 # encode some stream
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000647 s = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200648 f = writer(s)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000649 f.write("spam")
650 f.write("spam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000651 d = s.getvalue()
652 # check whether there is exactly one BOM in it
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000653 self.assertTrue(d == self.spamle or d == self.spambe)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000654 # try to read it back
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000655 s = io.BytesIO(d)
Victor Stinner05010702011-05-27 16:50:40 +0200656 f = reader(s)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000657 self.assertEqual(f.read(), "spamspam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000658
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000659 def test_badbom(self):
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000660 s = io.BytesIO(b"\xff\xff")
Victor Stinner05010702011-05-27 16:50:40 +0200661 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000662 self.assertRaises(UnicodeError, f.read)
663
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000664 s = io.BytesIO(b"\xff\xff\xff\xff")
Victor Stinner05010702011-05-27 16:50:40 +0200665 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000666 self.assertRaises(UnicodeError, f.read)
667
Walter Dörwald69652032004-09-07 20:24:22 +0000668 def test_partial(self):
669 self.check_partial(
Serhiy Storchaka48e188e2013-01-08 23:14:24 +0200670 "\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000671 [
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000672 "", # first byte of BOM read
673 "", # second byte of BOM read => byteorder known
674 "",
675 "\x00",
676 "\x00",
677 "\x00\xff",
678 "\x00\xff",
679 "\x00\xff\u0100",
680 "\x00\xff\u0100",
681 "\x00\xff\u0100\uffff",
Serhiy Storchaka48e188e2013-01-08 23:14:24 +0200682 "\x00\xff\u0100\uffff",
683 "\x00\xff\u0100\uffff",
684 "\x00\xff\u0100\uffff",
685 "\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000686 ]
687 )
688
Georg Brandl791f4e12009-09-17 11:41:24 +0000689 def test_handlers(self):
690 self.assertEqual(('\ufffd', 1),
691 codecs.utf_16_decode(b'\x01', 'replace', True))
692 self.assertEqual(('', 1),
693 codecs.utf_16_decode(b'\x01', 'ignore', True))
694
Walter Dörwalde22d3392005-11-17 08:52:34 +0000695 def test_errors(self):
Walter Dörwald3abcb012007-04-16 22:10:50 +0000696 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000697 b"\xff", "strict", True)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000698
699 def test_decoder_state(self):
700 self.check_state_handling_decode(self.encoding,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000701 "spamspam", self.spamle)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000702 self.check_state_handling_decode(self.encoding,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000703 "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000704
def test_bug691291(self):
    # Files are always opened in binary mode, even if no binary mode was
    # specified.  This means that no automatic conversion of '\n' is done
    # on reading and writing.
    text = 'Hello\r\nworld\r\n'
    encoded = text.encode(self.encoding)

    self.addCleanup(support.unlink, support.TESTFN)
    with open(support.TESTFN, 'wb') as out:
        out.write(encoded)
    # Opening with the 'U' mode flag is expected to emit a
    # DeprecationWarning; only the open() call is wrapped.
    with support.check_warnings(('', DeprecationWarning)):
        reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
    with reader:
        self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000719
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"
    # The code unit 0xDC80 (a lone low surrogate), little-endian.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # Expected decoder output after each successive input byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each input must be rejected by the strict decoder (final=True)
        # and must decode to the paired text with the 'replace' handler.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # One sub-test per input so that a failure names the offending
            # bytes and the remaining inputs are still exercised.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
763
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"
    # The code unit 0xDC80 (a lone low surrogate), big-endian.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # Expected decoder output after each successive input byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each input must be rejected by the strict decoder (final=True)
        # and must decode to the paired text with the 'replace' handler.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # One sub-test per input so that a failure names the offending
            # bytes and the remaining inputs are still exercised.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
807
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.  BOM is the prefix expected in front of
    # encoder output; it is empty here and overridden by UTF8SigTest.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # Expected decoder output after each successive input byte.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        # Sample text covering the boundaries between encoded lengths.
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_decode_error(self):
        # The same two invalid bytes, decoded with each error handler.
        data = b'[\x80\xff]'
        outcomes = (
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        )
        for error_handler, expected in outcomes:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # The reported error range must cover the two surrogates that
        # surrogateescape cannot encode.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        err = caught.exception
        self.assertEqual(err.object[err.start:err.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        pairs = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ]
        # Encoding: every pair, with the codec's BOM in front.
        for text, encoded in pairs:
            self.assertEqual(text.encode(self.encoding, "surrogatepass"),
                             self.BOM + encoded)

        # Decoding: only the first two pairs are checked in this direction.
        for text, encoded in pairs[:2]:
            self.assertEqual(encoded.decode(self.encoding, "surrogatepass"),
                             text)

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated surrogate encodings must still be rejected.
        for truncated in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                truncated.decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        prefixes = (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                    b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                    b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF')
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        cases += [prefix + suffix
                  for prefix in prefixes
                  for suffix in (b'\x7F', b'\xC0')]
        cases += [b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                  b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90']

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                with self.assertRaises(UnicodeDecodeError):
                    dec.decode(data)
900
Victor Stinnerf96418d2015-09-21 23:06:27 +0200901
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_ascii(self):
        def check_direct(chars):
            # chars must encode to their ASCII bytes and decode back.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(self.encoding), as_ascii)
            self.assertEqual(as_ascii.decode(self.encoding), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        every_ascii = set(bytes(range(0x80)).decode())
        covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(every_ascii - covered))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Expected decoder output after each successive input byte.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding it with 'replace')
        malformed = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in malformed:
            with self.subTest(raw=raw):
                # Strict decoding with final=True must reject the input.
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # (text, encoded form including the closing '-')
        samples = [
            ('\U000104A0', b'+2AHcoA-'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        ]
        for text, encoded in samples:
            self.assertEqual(text.encode(self.encoding), encoded)
            self.assertEqual(encoded.decode(self.encoding), text)
            # The closing '-' is optional when decoding.
            self.assertEqual(encoded[:-1].decode(self.encoding), text)
        # An explicit surrogate pair encodes like the character itself.
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')

    def test_lone_surrogates(self):
        # (input, result of decoding it with 'replace')
        samples = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in samples:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001034
1035
class UTF16ExTest(unittest.TestCase):
    # Direct tests of codecs.utf_16_ex_decode().

    def test_errors(self):
        # A single stray byte cannot be decoded strictly when final is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1043
class ReadBufferTest(unittest.TestCase):
    # Tests of codecs.readbuffer_encode().

    def test_array(self):
        # An array of bytes is returned as bytes together with its length.
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        # An empty str yields empty bytes and a consumed length of zero.
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is accepted.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
1059
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Re-runs the UTF8Test checks against "utf-8-sig", whose encoder output
    # is expected to start with the UTF-8 BOM.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Expected decoder output after each successive input byte.  The
        # text starts with U+FEFF, so the encoded form carries two BOMs.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder drops the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Decode bytestring through a "utf-8-sig" StreamReader, once with
        # unsized reads and once per listed read size, and check that the
        # concatenated chunks equal unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One sub-test per read size so a failure names the size.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty result marks the end of the stream.
                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1144
class EscapeDecodeTest(unittest.TestCase):
    # Tests of codecs.escape_decode().

    def test_empty(self):
        # Empty bytes and an empty bytearray both decode to empty bytes.
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Every byte other than the backslash passes through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Recognised escapes: (input, decoded bytes).
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for raw, decoded in recognised:
            check(raw, decoded)
        # Unrecognised escapes are kept verbatim and must emit a
        # DeprecationWarning.
        for code in range(97, 123):
            lower = bytes([code])
            if lower not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, b"\\" + lower)
            upper = lower.upper()
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + upper, b"\\" + upper)
        for raw, decoded in ((br"\8", b"\\8"),
                             (br"\9", b"\\9"),
                             (b"\\\xfa", b"\\\xfa")):
            with self.assertWarns(DeprecationWarning):
                check(raw, decoded)

    def test_errors(self):
        # Truncated \x escapes: rejected by default, dropped by "ignore",
        # turned into "?" by "replace".
        decode = codecs.escape_decode
        for partial in (br"\x", br"\x0"):
            bracketed = b"[" + partial + b"]"
            with self.assertRaises(ValueError):
                decode(partial)
            with self.assertRaises(ValueError):
                decode(bracketed)
            doubled = bracketed + partial
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001204
Victor Stinnerf96418d2015-09-21 23:06:27 +02001205
# From RFC 3492
# Each entry is a pair (text as str, expected punycode encoding as bytes);
# PunycodeTest iterates over the pairs in both directions.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1309
# Import-time sanity check: print any punycode_testcases entry that is not a
# two-item pair.  Malformed entries are only reported, not rejected.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313
Victor Stinnerf96418d2015-09-21 23:06:27 +02001314
class PunycodeTest(unittest.TestCase):
    # Runs every (text, punycode) pair of punycode_testcases through the
    # "punycode" codec in both directions.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # One sub-test per vector so a failure names the vector and
            # the remaining vectors are still exercised.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again from a bytes object rebuilt through an
                # ASCII round trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001333
Victor Stinnerf96418d2015-09-21 23:06:27 +02001334
# Nameprep test vectors, taken from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings and
# sits at index N-1 for vector 3.N.  An expected value of None means the
# input must be rejected; (None, None) marks a vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1487
1488
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        # Every vector runs in its own subTest, labelled with the section
        # number used in the table ("3.N").  A failing vector is therefore
        # reported with its original exception and traceback, and the
        # remaining vectors are still exercised.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % pos):
                # The Unicode strings are given in UTF-8
                text = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, text)
                else:
                    expected = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(text), expected)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001507
Victor Stinnerf96418d2015-09-21 23:06:27 +02001508
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Everything was consumed by the first read; nothing may be left.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().  The four
        # cases mirror test_builtin_decode: ASCII and non-ASCII names, each
        # with and without a trailing dot.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                chunks = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        # An unterminated last label is held back until final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot the last label is emitted immediately.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() walks the string character by character.  The four
        # cases mirror test_builtin_encode: ASCII and non-ASCII names, each
        # with and without a trailing dot.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # An unterminated last label is held back until final=True.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot the last label is emitted immediately.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1594
Victor Stinnerf96418d2015-09-21 23:06:27 +02001595
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions exposed by ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        # Put the previous LC_CTYPE back however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        expected = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected, codecs.__all__)
        # Every advertised name must actually be an attribute of the module.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec refuses everything, even empty input ...
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        # ... and whichever error handler is requested.
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1703
Victor Stinnerf96418d2015-09-21 23:06:27 +02001704
class StreamReaderTest(unittest.TestCase):
    """Check StreamReader.readlines() on a small UTF-8 byte stream."""

    def setUp(self):
        # Two lines of UTF-8 data: U+D55C followed by a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001714
Victor Stinnerf96418d2015-09-21 23:06:27 +02001715
class EncodedFileTest(unittest.TestCase):
    """Check codecs.EncodedFile() recoding in both directions."""

    def test_basic(self):
        # Reading: the wrapped file holds UTF-8, the caller gets UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the wrapped file gets Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001727
# Text encodings covered by the generic round-trip tests, in sorted order.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u", "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are added only when this build of codecs provides the
# matching encode function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001845
Victor Stinnerf96418d2015-09-21 23:06:27 +02001846
Walter Dörwald3abcb012007-04-16 22:10:50 +00001847class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001848 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001849 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001850 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001851 name = codecs.lookup(encoding).name
1852 if encoding.endswith("_codec"):
1853 name += "_codec"
1854 elif encoding == "latin_1":
1855 name = "latin_1"
1856 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001857
Inada Naoki6a16b182019-03-18 15:44:11 +09001858 (b, size) = codecs.getencoder(encoding)(s)
1859 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1860 (chars, size) = codecs.getdecoder(encoding)(b)
1861 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001862
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001863 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001864 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001865 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001866 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001867 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001868 for c in s:
1869 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001870 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001871 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001872 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001873 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001874 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001875 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001876 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001877 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001878 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001879 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001880
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001881 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001882 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001883 try:
1884 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001885 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001886 pass
1887 else:
1888 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001889 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001890 for c in s:
1891 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001892 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001893 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001894 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001895 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001896 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001897 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001898 self.assertEqual(decodedresult, s,
1899 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001900
1901 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001902 result = "".join(codecs.iterdecode(
1903 codecs.iterencode(s, encoding), encoding))
1904 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001905
1906 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001907 result = "".join(codecs.iterdecode(
1908 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001909 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001910
Victor Stinner554f3f02010-06-16 23:33:54 +00001911 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001912 # check incremental decoder/encoder with errors argument
1913 try:
1914 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001915 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001916 pass
1917 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001918 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001919 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001920 decodedresult = "".join(decoder.decode(bytes([c]))
1921 for c in encodedresult)
1922 self.assertEqual(decodedresult, s,
1923 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001924
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001925 @support.cpython_only
1926 def test_basics_capi(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001927 s = "abc123" # all codecs should be able to encode these
1928 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001929 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001930 # check incremental decoder/encoder (fetched via the C API)
1931 try:
Victor Stinner3d4226a2018-08-29 22:21:32 +02001932 cencoder = _testcapi.codec_incrementalencoder(encoding)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001933 except LookupError: # no IncrementalEncoder
1934 pass
1935 else:
1936 # check C API
1937 encodedresult = b""
1938 for c in s:
1939 encodedresult += cencoder.encode(c)
1940 encodedresult += cencoder.encode("", True)
Victor Stinner3d4226a2018-08-29 22:21:32 +02001941 cdecoder = _testcapi.codec_incrementaldecoder(encoding)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001942 decodedresult = ""
1943 for c in encodedresult:
1944 decodedresult += cdecoder.decode(bytes([c]))
1945 decodedresult += cdecoder.decode(b"", True)
1946 self.assertEqual(decodedresult, s,
1947 "encoding=%r" % encoding)
1948
1949 if encoding not in ("idna", "mbcs"):
1950 # check incremental decoder/encoder with errors argument
1951 try:
Victor Stinner3d4226a2018-08-29 22:21:32 +02001952 cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001953 except LookupError: # no IncrementalEncoder
1954 pass
1955 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001956 encodedresult = b"".join(cencoder.encode(c) for c in s)
Victor Stinner3d4226a2018-08-29 22:21:32 +02001957 cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001958 decodedresult = "".join(cdecoder.decode(bytes([c]))
1959 for c in encodedresult)
1960 self.assertEqual(decodedresult, s,
1961 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001962
Walter Dörwald729c31f2005-03-14 19:06:30 +00001963 def test_seek(self):
1964 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001965 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001966 for encoding in all_unicode_encodings:
1967 if encoding == "idna": # FIXME: See SF bug #1163178
1968 continue
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001969 if encoding in broken_unicode_with_stateful:
Walter Dörwald729c31f2005-03-14 19:06:30 +00001970 continue
Victor Stinner05010702011-05-27 16:50:40 +02001971 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001972 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001973 # Test that calling seek resets the internal codec state and buffers
1974 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001975 data = reader.read()
1976 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001977
Walter Dörwalde22d3392005-11-17 08:52:34 +00001978 def test_bad_decode_args(self):
1979 for encoding in all_unicode_encodings:
1980 decoder = codecs.getdecoder(encoding)
1981 self.assertRaises(TypeError, decoder)
1982 if encoding not in ("idna", "punycode"):
1983 self.assertRaises(TypeError, decoder, 42)
1984
1985 def test_bad_encode_args(self):
1986 for encoding in all_unicode_encodings:
1987 encoder = codecs.getencoder(encoding)
Inada Naoki6a16b182019-03-18 15:44:11 +09001988 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00001989
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001990 def test_encoding_map_type_initialized(self):
1991 from encodings import cp1140
1992 # This used to crash, we are only verifying there's no crash.
1993 table_type = type(cp1140.encoding_table)
1994 self.assertEqual(table_type, table_type)
1995
Walter Dörwald3abcb012007-04-16 22:10:50 +00001996 def test_decoder_state(self):
1997 # Check that getstate() and setstate() handle the state properly
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001998 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00001999 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002000 if encoding not in broken_unicode_with_stateful:
Walter Dörwald3abcb012007-04-16 22:10:50 +00002001 self.check_state_handling_decode(encoding, u, u.encode(encoding))
2002 self.check_state_handling_encode(encoding, u, u.encode(encoding))
2003
Victor Stinnerf96418d2015-09-21 23:06:27 +02002004
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with three kinds of mapping:
    # a str indexed by byte value, a dict of int -> str and a dict of
    # int -> int (code point).

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))
        self.assertEqual(decode(data, "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # Tables in which byte 2 is either past the end or maps to U+FFFE;
        # both are expected to be treated as undecodable.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)

        by_handler = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in by_handler:
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, text in complete:
            self.assertEqual(decode(data, "strict", mapping), (text, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (the last case is Issue #14850).
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        by_handler = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in by_handler:
            for mapping in incomplete:
                self.assertEqual(decode(data, errors, mapping), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(data, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # A code point above sys.maxunicode is a TypeError, not a
        # decoding error.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        by_handler = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in by_handler:
            for mapping in incomplete:
                self.assertEqual(decode(data, errors, mapping), (text, 3))
2239
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002240
class WithStmtTest(unittest.TestCase):
    # The stream wrappers in codecs must be usable as context managers.

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        wrapped = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with wrapped as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the block must have closed the underlying stream
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapped = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapped as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002254
Victor Stinnerf96418d2015-09-21 23:06:27 +02002255
class TypesTest(unittest.TestCase):
    # Checks which argument types the low-level decoder functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only included when the codecs module provides it
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is above the Unicode range: check each error handler
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002295
Serhiy Storchakad6793772013-01-29 10:20:44 +02002296
2297class UnicodeEscapeTest(unittest.TestCase):
2298 def test_empty(self):
2299 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2300 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2301
2302 def test_raw_encode(self):
2303 encode = codecs.unicode_escape_encode
2304 for b in range(32, 127):
2305 if b != b'\\'[0]:
2306 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2307
2308 def test_raw_decode(self):
2309 decode = codecs.unicode_escape_decode
2310 for b in range(256):
2311 if b != b'\\'[0]:
2312 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2313
2314 def test_escape_encode(self):
2315 encode = codecs.unicode_escape_encode
2316 check = coding_checker(self, encode)
2317 check('\t', br'\t')
2318 check('\n', br'\n')
2319 check('\r', br'\r')
2320 check('\\', br'\\')
2321 for b in range(32):
2322 if chr(b) not in '\t\n\r':
2323 check(chr(b), ('\\x%02x' % b).encode())
2324 for b in range(127, 256):
2325 check(chr(b), ('\\x%02x' % b).encode())
2326 check('\u20ac', br'\u20ac')
2327 check('\U0001d120', br'\U0001d120')
2328
2329 def test_escape_decode(self):
2330 decode = codecs.unicode_escape_decode
2331 check = coding_checker(self, decode)
2332 check(b"[\\\n]", "[]")
2333 check(br'[\"]', '["]')
2334 check(br"[\']", "[']")
2335 check(br"[\\]", r"[\]")
2336 check(br"[\a]", "[\x07]")
2337 check(br"[\b]", "[\x08]")
2338 check(br"[\t]", "[\x09]")
2339 check(br"[\n]", "[\x0a]")
2340 check(br"[\v]", "[\x0b]")
2341 check(br"[\f]", "[\x0c]")
2342 check(br"[\r]", "[\x0d]")
2343 check(br"[\7]", "[\x07]")
Serhiy Storchakad6793772013-01-29 10:20:44 +02002344 check(br"[\78]", "[\x078]")
2345 check(br"[\41]", "[!]")
2346 check(br"[\418]", "[!8]")
2347 check(br"[\101]", "[A]")
2348 check(br"[\1010]", "[A0]")
2349 check(br"[\x41]", "[A]")
2350 check(br"[\x410]", "[A0]")
2351 check(br"\u20ac", "\u20ac")
2352 check(br"\U0001d120", "\U0001d120")
R David Murray110b6fe2016-09-08 15:34:08 -04002353 for i in range(97, 123):
2354 b = bytes([i])
2355 if b not in b'abfnrtuvx':
2356 with self.assertWarns(DeprecationWarning):
2357 check(b"\\" + b, "\\" + chr(i))
2358 if b.upper() not in b'UN':
2359 with self.assertWarns(DeprecationWarning):
2360 check(b"\\" + b.upper(), "\\" + chr(i-32))
2361 with self.assertWarns(DeprecationWarning):
2362 check(br"\8", "\\8")
2363 with self.assertWarns(DeprecationWarning):
2364 check(br"\9", "\\9")
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03002365 with self.assertWarns(DeprecationWarning):
2366 check(b"\\\xfa", "\\\xfa")
Serhiy Storchakad6793772013-01-29 10:20:44 +02002367
2368 def test_decode_errors(self):
2369 decode = codecs.unicode_escape_decode
2370 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2371 for i in range(d):
2372 self.assertRaises(UnicodeDecodeError, decode,
2373 b"\\" + c + b"0"*i)
2374 self.assertRaises(UnicodeDecodeError, decode,
2375 b"[\\" + c + b"0"*i + b"]")
2376 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2377 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2378 self.assertEqual(decode(data, "replace"),
2379 ("[\ufffd]\ufffd", len(data)))
2380 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2381 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2382 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2383
2384
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002385class RawUnicodeEscapeTest(unittest.TestCase):
2386 def test_empty(self):
2387 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2388 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2389
2390 def test_raw_encode(self):
2391 encode = codecs.raw_unicode_escape_encode
2392 for b in range(256):
2393 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2394
2395 def test_raw_decode(self):
2396 decode = codecs.raw_unicode_escape_decode
2397 for b in range(256):
2398 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2399
2400 def test_escape_encode(self):
2401 encode = codecs.raw_unicode_escape_encode
2402 check = coding_checker(self, encode)
2403 for b in range(256):
2404 if b not in b'uU':
2405 check('\\' + chr(b), b'\\' + bytes([b]))
2406 check('\u20ac', br'\u20ac')
2407 check('\U0001d120', br'\U0001d120')
2408
2409 def test_escape_decode(self):
2410 decode = codecs.raw_unicode_escape_decode
2411 check = coding_checker(self, decode)
2412 for b in range(256):
2413 if b not in b'uU':
2414 check(b'\\' + bytes([b]), '\\' + chr(b))
2415 check(br"\u20ac", "\u20ac")
2416 check(br"\U0001d120", "\U0001d120")
2417
2418 def test_decode_errors(self):
2419 decode = codecs.raw_unicode_escape_decode
2420 for c, d in (b'u', 4), (b'U', 4):
2421 for i in range(d):
2422 self.assertRaises(UnicodeDecodeError, decode,
2423 b"\\" + c + b"0"*i)
2424 self.assertRaises(UnicodeDecodeError, decode,
2425 b"[\\" + c + b"0"*i + b"]")
2426 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2427 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2428 self.assertEqual(decode(data, "replace"),
2429 ("[\ufffd]\ufffd", len(data)))
2430 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2431 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2432 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2433
2434
class EscapeEncodeTest(unittest.TestCase):
    # Tests for codecs.escape_encode(), which works on bytes only.

    def test_escape_encode(self):
        # (input bytes, expected (escaped bytes, consumed length))
        cases = (
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        )
        for data, expected in cases:
            with self.subTest(data=data):
                result = codecs.escape_encode(data)
                self.assertEqual(result, expected)
        # str and bytearray arguments are rejected
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2453
2454
class SurrogateEscapeTest(unittest.TestCase):
    # Tests for the "surrogateescape" error handler: undecodable bytes
    # become lone surrogates U+DCxx and are turned back into the same
    # bytes when encoding.

    def check_roundtrip(self, raw, text, encoding):
        # raw must decode to text, and text must encode back to raw
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self.check_roundtrip(b"foo\x80bar", "foo\udc80bar", "utf-8")
        # bad-utf-8 encoded surrogate
        self.check_roundtrip(b"\xed\xb0\x80", "\udced\udcb0\udc80", "utf-8")

    def test_ascii(self):
        # bad byte
        self.check_roundtrip(b"foo\x80bar", "foo\udc80bar", "ascii")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.check_roundtrip(b"foo\xa5bar", "foo\udca5bar", "iso-8859-3")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2487
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002488
class BomTest(unittest.TestCase):
    # Checks how often a BOM is written when a file opened with
    # codecs.open() is written to and seeked.

    def test_seek0(self):
        data = "1234567890"
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        path = support.TESTFN
        self.addCleanup(support.unlink, path)

        for encoding in encodings:
            def reopen():
                # 'w+' truncates, so each scenario starts from an empty file
                return codecs.open(path, 'w+', encoding=encoding)

            # Check if the BOM is written only once
            with reopen() as f:
                f.write(data)
                f.write(data)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with reopen() as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen() as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen() as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002544
Victor Stinner3fed0872010-05-22 02:16:27 +00002545
# Names of the bytes-to-bytes transform codecs that are always available.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alternative names under which each transform codec is registered.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# The compression codecs are only listed when their module can be imported.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002575
Victor Stinnerf96418d2015-09-21 23:06:27 +02002576
Georg Brandl02524622010-12-02 18:06:51 +00002577class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002578
Georg Brandl02524622010-12-02 18:06:51 +00002579 def test_basics(self):
2580 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002581 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002582 with self.subTest(encoding=encoding):
2583 # generic codecs interface
2584 (o, size) = codecs.getencoder(encoding)(binput)
2585 self.assertEqual(size, len(binput))
2586 (i, size) = codecs.getdecoder(encoding)(o)
2587 self.assertEqual(size, len(o))
2588 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002589
Georg Brandl02524622010-12-02 18:06:51 +00002590 def test_read(self):
2591 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002592 with self.subTest(encoding=encoding):
2593 sin = codecs.encode(b"\x80", encoding)
2594 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2595 sout = reader.read()
2596 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002597
2598 def test_readline(self):
2599 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002600 with self.subTest(encoding=encoding):
2601 sin = codecs.encode(b"\x80", encoding)
2602 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2603 sout = reader.readline()
2604 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002605
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002606 def test_buffer_api_usage(self):
2607 # We check all the transform codecs accept memoryview input
2608 # for encoding and decoding
2609 # and also that they roundtrip correctly
2610 original = b"12345\x80"
2611 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002612 with self.subTest(encoding=encoding):
2613 data = original
2614 view = memoryview(data)
2615 data = codecs.encode(data, encoding)
2616 view_encoded = codecs.encode(view, encoding)
2617 self.assertEqual(view_encoded, data)
2618 view = memoryview(data)
2619 data = codecs.decode(data, encoding)
2620 self.assertEqual(data, original)
2621 view_decoded = codecs.decode(view, encoding)
2622 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002623
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002624 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002625 # Check binary -> binary codecs give a good error for str input
2626 bad_input = "bad input type"
2627 for encoding in bytes_transform_encodings:
2628 with self.subTest(encoding=encoding):
R David Murray44b548d2016-09-08 13:59:53 -04002629 fmt = (r"{!r} is not a text encoding; "
2630 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002631 msg = fmt.format(encoding)
2632 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002633 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002634 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002635
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002636 def test_text_to_binary_blacklists_text_transforms(self):
2637 # Check str.encode gives a good error message for str -> str codecs
2638 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002639 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002640 with self.assertRaisesRegex(LookupError, msg):
2641 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002642
def test_binary_to_text_blacklists_binary_transforms(self):
    # Check bytes.decode and bytearray.decode give a good error
    # message for binary -> binary codecs
    data = b"encode first to ensure we meet any format restrictions"
    # The pattern only depends on the codec name, so build it once
    fmt = (r"{!r} is not a text encoding; "
           r"use codecs.decode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded_data = codecs.encode(data, encoding)
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                encoded_data.decode(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                bytearray(encoded_data).decode(encoding)
2657
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002658 def test_binary_to_text_blacklists_text_transforms(self):
2659 # Check str -> str codec gives a good error for binary input
2660 for bad_input in (b"immutable", bytearray(b"mutable")):
2661 with self.subTest(bad_input=bad_input):
2662 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002663 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002664 with self.assertRaisesRegex(LookupError, msg) as failure:
2665 bad_input.decode("rot_13")
2666 self.assertIsNone(failure.exception.__cause__)
2667
Zachary Wareefa2e042013-12-30 14:54:11 -06002668 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002669 def test_custom_zlib_error_is_wrapped(self):
2670 # Check zlib codec gives a good error for malformed input
2671 msg = "^decoding with 'zlib_codec' codec failed"
2672 with self.assertRaisesRegex(Exception, msg) as failure:
2673 codecs.decode(b"hello", "zlib_codec")
2674 self.assertIsInstance(failure.exception.__cause__,
2675 type(failure.exception))
2676
2677 def test_custom_hex_error_is_wrapped(self):
2678 # Check hex codec gives a good error for malformed input
2679 msg = "^decoding with 'hex_codec' codec failed"
2680 with self.assertRaisesRegex(Exception, msg) as failure:
2681 codecs.decode(b"hello", "hex_codec")
2682 self.assertIsInstance(failure.exception.__cause__,
2683 type(failure.exception))
2684
# Unfortunately, the bz2 module throws OSError, which the codec
# machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002687
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    for codec_name, aliases in transform_aliases.items():
        # Every alias must resolve to the same codec as its canonical name
        expected_name = codecs.lookup(codec_name).name
        for alias in aliases:
            with self.subTest(alias=alias):
                info = codecs.lookup(alias)
                self.assertEqual(info.name, expected_name)
2696
Martin Panter06171bd2015-09-12 00:34:28 +00002697 def test_quopri_stateless(self):
2698 # Should encode with quotetabs=True
2699 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2700 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2701 # But should still support unescaped tabs and spaces
2702 unescaped = b"space tab eol\n"
2703 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2704
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002705 def test_uu_invalid(self):
2706 # Missing "begin" line
2707 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2708
Nick Coghlan8b097b42013-11-13 23:49:21 +10002709
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002715
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function backed by the _TEST_CODECS mapping.

    Return the entry stored under codec_name, or None when there is no
    such entry.
    """
    return _TEST_CODECS.get(codec_name)


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)
2723
try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Do nothing: stand-in used when _codecs has no _forget_codec."""
2730
2731
class ExceptionChainingTest(unittest.TestCase):
    # Exercises which exceptions raised by a codec's encode/decode
    # callables get wrapped with "<operation> with <codec> codec failed"
    # and which ones propagate unchanged.

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        # A KeyError here is ignored, exactly as before
        with contextlib.suppress(KeyError):
            _forget_codec(self.codec_name)

    def set_codec(self, encode, decode):
        """Store a CodecInfo built from encode/decode under self.codec_name."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the managed block raises a wrapped exc_type.

        The raised exception must match the "<operation> with <codec>
        codec failed (<type>: <msg>)" pattern and be chained from an
        exc_type instance that carries a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check obj_to_raise is wrapped on all four encode/decode paths."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check obj_to_raise surfaces as a RuntimeError matching msg on
        all four encode/decode paths."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2891
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002892
Georg Brandl02524622010-12-02 18:06:51 +00002893
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode()
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must name the code page that failed
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decoding cases against code page cp.

        An expected value of None means the decode must raise
        UnicodeDecodeError; otherwise the decoded text must equal it.
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encoding cases against code page cp.

        An expected value of None means the encode must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal it and
        the reported length must be len(text).
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if is_code_page_present(cp):
                self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                                 ('abc', 3), f'cp{cp}')
            elif support.verbose:
                print(f" skipping cp={cp}")
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a truncated trailing sequence is left unconsumed
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before running the remaining checks
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(self.CP_UTF8, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before running the remaining checks
        del encoded
        self.assertEqual(len(decoded[0]), size)
        self.assertEqual(decoded[0][:10], '0123456\ud10001')
        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
3108
Victor Stinner3a50e702011-10-18 21:21:00 +02003109
class ASCIITest(unittest.TestCase):
    # Encoding/decoding with the 'ascii' codec and its error handlers

    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        for data, error_handler, expected in (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('ascii', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode('ascii', error_handler),
                                 expected)
3147
3148
class Latin1Test(unittest.TestCase):
    # Encoding/decoding with the 'latin1' codec and its error handlers

    def test_encode(self):
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
3184
3185
class StreamRecoderTest(unittest.TestCase):
    # Tests for codecs.StreamRecoder and codecs.EncodedFile

    def test_writelines(self):
        bio = io.BytesIO()
        codec = codecs.lookup('ascii')
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        sr.writelines([b'a', b'b'])
        self.assertEqual(bio.getvalue(), b'ab')

    def test_write(self):
        bio = io.BytesIO()
        codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        sr.write(text.encode('latin1'))
        self.assertEqual(bio.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        bio = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        self.assertEqual(sr.readline(), b'line1\n')
        # Seeking back to the start must make the first line readable again
        sr.seek(0)
        self.assertEqual(sr.readline(), b'line1\n')
        self.assertEqual(sr.readline(), b'line2\n')
        self.assertEqual(sr.readline(), b'line3\n')
        self.assertEqual(sr.readline(), b'')

    def test_seeking_write(self):
        bio = io.BytesIO('123456789\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        sr.seek(2)
        sr.write(b'\nabc\n')
        self.assertEqual(sr.readline(), b'789\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'1\n')
        self.assertEqual(sr.readline(), b'abc\n')
        self.assertEqual(sr.readline(), b'789\n')
3230
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003231
Victor Stinner3d4226a2018-08-29 22:21:32 +02003232@unittest.skipIf(_testcapi is None, 'need _testcapi module')
3233class LocaleCodecTest(unittest.TestCase):
3234 """
3235 Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
3236 """
3237 ENCODING = sys.getfilesystemencoding()
3238 STRINGS = ("ascii", "ulatin1:\xa7\xe9",
3239 "u255:\xff",
3240 "UCS:\xe9\u20ac\U0010ffff",
3241 "surrogates:\uDC80\uDCFF")
3242 BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
3243 SURROGATES = "\uDC80\uDCFF"
3244
def encode(self, text, errors="strict"):
    """Return the result of _testcapi.EncodeLocaleEx(text, 0, errors)."""
    return _testcapi.EncodeLocaleEx(text, 0, errors)
3247
def check_encode_strings(self, errors):
    """Compare self.encode() with str.encode() for every entry of STRINGS.

    When str.encode() raises UnicodeEncodeError for self.ENCODING,
    self.encode() must raise RuntimeError with an "encode error"
    message; otherwise both must return the same bytes.
    """
    for text in self.STRINGS:
        with self.subTest(text=text):
            try:
                expected = text.encode(self.ENCODING, errors)
            except UnicodeEncodeError:
                with self.assertRaises(RuntimeError) as cm:
                    self.encode(text, errors)
                errmsg = str(cm.exception)
                self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=")
            else:
                encoded = self.encode(text, errors)
                self.assertEqual(encoded, expected)
3261
3262 def test_encode_strict(self):
3263 self.check_encode_strings("strict")
3264
3265 def test_encode_surrogateescape(self):
3266 self.check_encode_strings("surrogateescape")
3267
3268 def test_encode_surrogatepass(self):
3269 try:
3270 self.encode('', 'surrogatepass')
3271 except ValueError as exc:
3272 if str(exc) == 'unsupported error handler':
3273 self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
3274 f"surrogatepass error handler")
3275 else:
3276 raise
3277
3278 self.check_encode_strings("surrogatepass")
3279
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003280 def test_encode_unsupported_error_handler(self):
3281 with self.assertRaises(ValueError) as cm:
3282 self.encode('', 'backslashreplace')
3283 self.assertEqual(str(cm.exception), 'unsupported error handler')
3284
Victor Stinner3d4226a2018-08-29 22:21:32 +02003285 def decode(self, encoded, errors="strict"):
3286 return _testcapi.DecodeLocaleEx(encoded, 0, errors)
3287
3288 def check_decode_strings(self, errors):
3289 is_utf8 = (self.ENCODING == "utf-8")
3290 if is_utf8:
3291 encode_errors = 'surrogateescape'
3292 else:
3293 encode_errors = 'strict'
3294
3295 strings = list(self.BYTES_STRINGS)
3296 for text in self.STRINGS:
3297 try:
3298 encoded = text.encode(self.ENCODING, encode_errors)
3299 if encoded not in strings:
3300 strings.append(encoded)
3301 except UnicodeEncodeError:
3302 encoded = None
3303
3304 if is_utf8:
3305 encoded2 = text.encode(self.ENCODING, 'surrogatepass')
3306 if encoded2 != encoded:
3307 strings.append(encoded2)
3308
3309 for encoded in strings:
3310 with self.subTest(encoded=encoded):
3311 try:
3312 expected = encoded.decode(self.ENCODING, errors)
3313 except UnicodeDecodeError:
3314 with self.assertRaises(RuntimeError) as cm:
3315 self.decode(encoded, errors)
3316 errmsg = str(cm.exception)
3317 self.assertTrue(errmsg.startswith("decode error: "), errmsg)
3318 else:
3319 decoded = self.decode(encoded, errors)
3320 self.assertEqual(decoded, expected)
3321
3322 def test_decode_strict(self):
3323 self.check_decode_strings("strict")
3324
3325 def test_decode_surrogateescape(self):
3326 self.check_decode_strings("surrogateescape")
3327
3328 def test_decode_surrogatepass(self):
3329 try:
3330 self.decode(b'', 'surrogatepass')
3331 except ValueError as exc:
3332 if str(exc) == 'unsupported error handler':
3333 self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
3334 f"surrogatepass error handler")
3335 else:
3336 raise
3337
3338 self.check_decode_strings("surrogatepass")
3339
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003340 def test_decode_unsupported_error_handler(self):
3341 with self.assertRaises(ValueError) as cm:
3342 self.decode(b'', 'backslashreplace')
3343 self.assertEqual(str(cm.exception), 'unsupported error handler')
3344
Victor Stinner3d4226a2018-08-29 22:21:32 +02003345
class Rot13Test(unittest.TestCase):
    """Check the 'rot-13' codec through the one-shot and incremental APIs."""

    def test_encode(self):
        """codecs.encode() applies ROT-13 to a str."""
        self.assertEqual(
            codecs.encode("Caesar liked ciphers", 'rot-13'),
            'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        """codecs.decode() reverses ROT-13 on a str."""
        self.assertEqual(
            codecs.decode('Rg gh, Oehgr?', 'rot-13'),
            'Et tu, Brute?')

    def test_incremental_encode(self):
        """The incremental encoder transforms a single chunk of text."""
        encoder_factory = codecs.getincrementalencoder('rot-13')
        rotated = encoder_factory().encode('ABBA nag Cheryl Baker')
        self.assertEqual(rotated, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        """The incremental decoder transforms a single chunk of text."""
        decoder_factory = codecs.getincrementaldecoder('rot-13')
        restored = decoder_factory().decode('terra Ares envy tha')
        self.assertEqual(restored, 'green Nerf rail gun')
3365
3366
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """

    def test_rot13_func(self):
        """rot13(infile, outfile) writes the ROT-13 form of infile's text."""
        # Import the submodule explicitly.  "import encodings" alone does not
        # load encodings.rot_13, so reaching it as an attribute of the
        # package raises AttributeError unless something else (for example
        # an earlier 'rot-13' codec lookup) has already imported it.  The
        # explicit import keeps this test independent of execution order.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)
        # Rewind before reading back what rot13() wrote.
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3381
3382
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()