blob: 8c14f5981d0bcea2bcdb390b6d64174d6128c6ef [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# Probe for optional modules; each name falls back to None when the import
# fails, so the rest of the module can test for availability.
try:
    import _testcapi
except ImportError:
    _testcapi = None

try:
    import ctypes
except ImportError:
    ctypes = None
    # -1 stands in for "size of wchar_t unknown" when ctypes is unavailable.
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a function that checks the result of *coder* on one input.

    The returned ``check(input, expect)`` calls ``coder(input)`` and asserts,
    through ``self.assertEqual``, that the result equals
    ``(expect, len(input))``.
    """
    def check(input, expect):
        expected = (expect, len(input))
        self.assertEqual(coder(input), expected)
    return check

# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Return the result of GetCPInfoExW() for the code page *cp*.

    The function is looked up in kernel32 through ctypes and called with
    flags set to 0.  Its BOOL result is returned unchanged, so the value is
    truthy when the call succeeds.

    The ctypes names used here (WINFUNCTYPE, WinDLL) only exist on Windows;
    elsewhere the local import raises ImportError.
    """
    # Imported locally so that the module stays importable without ctypes.
    from ctypes import POINTER, WINFUNCTYPE, WinDLL, Structure
    from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, DWORD

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    return GetCPInfoEx(cp, 0, info)

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer passed to the constructor determines the buffer type; write()
    appends to it with ``+=`` and read() slices data off the front.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the internal buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the buffer.

        A negative *size* (the default) returns everything that is buffered;
        otherwise at most *size* items are returned.
        """
        if size < 0:
            data = self._buffer
            # Slicing keeps the buffer's type while making it empty.
            self._buffer = self._buffer[:0]
        else:
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        return data


class MixInCheckStateHandling:
    """Mixin with helpers that check getstate()/setstate() of incremental
    codecs.  The assert* methods are expected to come from the class this
    is mixed into.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* in two parts at every split point, moving the decoder
        state to a fresh decoder in between, and check the result is *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* in two parts at every split point, moving the encoder
        state to a fresh encoder in between, and check the result is *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # Continue with a new encoder restored from the saved state
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)


class ReadTest(MixInCheckStateHandling):
    # Shared tests for stream readers and incremental decoders.  The methods
    # read self.encoding and self.ill_formed_sequence, neither of which is
    # defined here, so concrete subclasses have to provide them.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until an empty one comes back and join them with "|"
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over the reader must give back exactly the lines below.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data can arrive between
        # two readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of self.ill_formed_sequence when decoding with
    # the "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for an empty string is treated as the BOM
        # and stripped from the encoded fragments below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')


class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    # Bytes spliced into encoded data by ReadTest.test_lone_surrogates; the
    # byte order follows sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little-endian and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])


class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    # Bytes spliced into encoded data by ReadTest.test_lone_surrogates
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected result per encoded byte
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])


class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    # Bytes spliced into encoded data by ReadTest.test_lone_surrogates
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected result per encoded byte
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


Ezio Melotti5d3dba02013-01-11 06:02:07 +0200627class UTF16Test(ReadTest, unittest.TestCase):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000628 encoding = "utf-16"
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200629 if sys.byteorder == 'little':
630 ill_formed_sequence = b"\x80\xdc"
631 else:
632 ill_formed_sequence = b"\xdc\x80"
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000633
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000634 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
635 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000636
637 def test_only_one_bom(self):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000638 _,_,reader,writer = codecs.lookup(self.encoding)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000639 # encode some stream
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000640 s = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200641 f = writer(s)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000642 f.write("spam")
643 f.write("spam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000644 d = s.getvalue()
645 # check whether there is exactly one BOM in it
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000646 self.assertTrue(d == self.spamle or d == self.spambe)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000647 # try to read it back
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000648 s = io.BytesIO(d)
Victor Stinner05010702011-05-27 16:50:40 +0200649 f = reader(s)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000650 self.assertEqual(f.read(), "spamspam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000651
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000652 def test_badbom(self):
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000653 s = io.BytesIO(b"\xff\xff")
Victor Stinner05010702011-05-27 16:50:40 +0200654 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000655 self.assertRaises(UnicodeError, f.read)
656
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +0000657 s = io.BytesIO(b"\xff\xff\xff\xff")
Victor Stinner05010702011-05-27 16:50:40 +0200658 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000659 self.assertRaises(UnicodeError, f.read)
660
Walter Dörwald69652032004-09-07 20:24:22 +0000661 def test_partial(self):
662 self.check_partial(
Serhiy Storchaka48e188e2013-01-08 23:14:24 +0200663 "\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000664 [
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000665 "", # first byte of BOM read
666 "", # second byte of BOM read => byteorder known
667 "",
668 "\x00",
669 "\x00",
670 "\x00\xff",
671 "\x00\xff",
672 "\x00\xff\u0100",
673 "\x00\xff\u0100",
674 "\x00\xff\u0100\uffff",
Serhiy Storchaka48e188e2013-01-08 23:14:24 +0200675 "\x00\xff\u0100\uffff",
676 "\x00\xff\u0100\uffff",
677 "\x00\xff\u0100\uffff",
678 "\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000679 ]
680 )
681
Georg Brandl791f4e12009-09-17 11:41:24 +0000682 def test_handlers(self):
683 self.assertEqual(('\ufffd', 1),
684 codecs.utf_16_decode(b'\x01', 'replace', True))
685 self.assertEqual(('', 1),
686 codecs.utf_16_decode(b'\x01', 'ignore', True))
687
Walter Dörwalde22d3392005-11-17 08:52:34 +0000688 def test_errors(self):
Walter Dörwald3abcb012007-04-16 22:10:50 +0000689 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000690 b"\xff", "strict", True)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000691
692 def test_decoder_state(self):
693 self.check_state_handling_decode(self.encoding,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000694 "spamspam", self.spamle)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000695 self.check_state_handling_decode(self.encoding,
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000696 "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000697
def test_bug691291(self):
    # Files are always opened in binary mode, even if no binary mode was
    # specified.  This means that no automatic conversion of '\n' is done
    # on reading and writing.
    text = 'Hello\r\nworld\r\n'

    payload = text.encode(self.encoding)
    # Register the cleanup before the file is created.
    self.addCleanup(support.unlink, support.TESTFN)
    with open(support.TESTFN, 'wb') as out:
        out.write(payload)
    # Opening with mode 'U' is expected to emit a DeprecationWarning.
    with support.check_warnings(('', DeprecationWarning)):
        reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
    with reader:
        self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000712
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps going after a failure and names the input,
            # matching UTF7Test.test_errors.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips as a little-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
756
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps going after a failure and names the input,
            # matching UTF7Test.test_errors.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips as a big-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
800
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.  BOM is the byte prefix expected in
    # front of encoded output; it is empty for plain UTF-8.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # Expected partial results, written as (prefix length, repeats)
        # over the text handed to check_partial.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        repeats = ((1, 2), (2, 2), (3, 3), (4, 3), (5, 4), (6, 1))
        expected = [text[:size] for size, times in repeats
                    for _ in range(times)]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same malformed input decoded under each error handler.
        data = b'[\x80\xff]'
        outcomes = {
            'ignore': '[]',
            'replace': '[\ufffd\ufffd]',
            'surrogateescape': '[\udc80\udcff]',
            'backslashreplace': '[\\x80\\xff]',
        }
        for error_handler, expected in outcomes.items():
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = cm.exception
        # The error span covers only the two surrogates that cannot be
        # escaped.
        self.assertEqual(error.object[error.start:error.end],
                         '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        handler = "surrogatepass"
        to_encode = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ]
        for text, encoded in to_encode:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + encoded)

        to_decode = [
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        ]
        for encoded, text in to_decode:
            self.assertEqual(encoded.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))
        # Truncated or interrupted surrogate sequences still fail.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000876
Victor Stinnerf96418d2015-09-21 23:06:27 +0200877
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_ascii(self):
        def check_direct(chars):
            # These characters must encode to themselves and decode back.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(self.encoding), as_ascii)
            self.assertEqual(as_ascii.decode(self.encoding), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        every_ascii = set(bytes(range(0x80)).decode())
        already_covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(every_ascii - already_covered))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Expected partial results, written as (prefix length, repeats)
        # over the text handed to check_partial.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        repeats = (
            (1, 2), (2, 1), (3, 1), (4, 5), (5, 1), (6, 5),
            (7, 1), (8, 5), (9, 1), (10, 8), (11, 1), (12, 1),
        )
        expected = [text[:size] for size, times in repeats
                    for _ in range(times)]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A surrogate pair encodes like the character it represents.
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        cases = [
            ('\U000104A0', b'+2AHcoA-'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        ]
        for text, encoded in cases:
            self.assertEqual(text.encode(self.encoding), encoded)
            self.assertEqual(encoded.decode(self.encoding), text)
            # The same bytes without the closing '-' decode identically.
            self.assertEqual(encoded[:-1].decode(self.encoding), text)

    def test_lone_surrogates(self):
        # (input, result of decoding with the 'replace' handler)
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001010
1011
class UTF16ExTest(unittest.TestCase):
    # Error and argument checks for codecs.utf_16_ex_decode.

    def test_errors(self):
        # Strict, final decoding of a single byte must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without arguments is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1019
class ReadBufferTest(unittest.TestCase):
    # Checks for codecs.readbuffer_encode.

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is accepted.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
1035
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Tests for the "utf-8-sig" codec; BOM is the prefix expected in
    # front of encoded output.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Incremental decoding of utf-8-sig output gives back the text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig stream reader, with and
        # without size hints, and check that the pieces add up to
        # unistring.  Shared by test_stream_bom and test_stream_bare.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest names the size hint that failed.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM.
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # The same input without a BOM.
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
1120
class EscapeDecodeTest(unittest.TestCase):
    # Checks for codecs.escape_decode.

    def test_empty(self):
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            char = bytes([value])
            if char == b'\\':
                continue
            payload = char + b'0'
            self.assertEqual(decode(payload), (payload, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in recognised:
            check(escaped, plain)
        # Unrecognised escapes are kept verbatim but warn.
        for code in range(ord('a'), ord('z') + 1):
            letter = bytes([code])
            if letter not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + letter, b"\\" + letter)
            capital = letter.upper()
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + capital, b"\\" + capital)
        for unknown in (br"\8", br"\9", b"\\\xfa"):
            with self.assertWarns(DeprecationWarning):
                check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # A \x escape with zero or one hex digit is incomplete.
        for digits, consumed in ((b"", 6), (b"0", 8)):
            bad = b"\\x" + digits
            bracketed = b"[" + bad + b"]"
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            both = bracketed + bad
            self.assertEqual(decode(both, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(both, "replace"), (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001180
Victor Stinnerf96418d2015-09-21 23:06:27 +02001181
# Sample strings from RFC 3492, as (unicode text, punycode bytes) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple
# (for example, one that lost the comma between its two parts).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289
Victor Stinnerf96418d2015-09-21 23:06:27 +02001290
class PunycodeTest(unittest.TestCase):
    # Runs the punycode codec over every pair in punycode_testcases.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii").lower()
            wanted = str(puny, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode once more after an ASCII decode/encode round trip.
            roundtripped = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, roundtripped.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309
Victor Stinnerf96418d2015-09-21 23:06:27 +02001310
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (orig, prepped) pair of UTF-8 encoded byte strings:
#   * prepped is None  -> orig is expected to be rejected,
#   * (None, None)     -> placeholder for a skipped vector; it keeps the
#                         list position in step with the "3.N" numbering
#                         used in the comments below.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the title is kept as given, but the bytes \xef\xb9\xb6 below
    # are the UTF-8 encoding of U+FE76, not U+FB38.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1463
1464
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Check every non-skipped vector in nameprep_tests.

        Each vector runs in its own subTest labelled "Test 3.N" (N is the
        1-based position in the table), so a failing vector is reported as
        an ordinary test failure with its number and the remaining vectors
        are still exercised.
        """
        from encodings.idna import nameprep
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            with self.subTest("Test 3.%d" % number):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001483
Victor Stinnerf96418d2015-09-21 23:06:27 +02001484
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") handles plain and xn-- labels, with/without dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") handles ASCII and non-ASCII labels."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """A second read() on an exhausted idna StreamReader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time decoding gives the same text as one-shot decoding."""
        # Cover ASCII and xn-- names, each with and without a trailing dot.
        for encoded, expected in (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ):
            with self.subTest(encoded=encoded):
                self.assertEqual(
                    "".join(codecs.iterdecode((bytes([c]) for c in encoded),
                                              "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is only emitted by the
        # final (flushing) call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is emitted as soon as the dot
        # arrives, so the final call has nothing left to return.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time encoding gives the same bytes as one-shot."""
        # Cover ASCII and non-ASCII names, each with and without a
        # trailing dot.
        for text, expected in (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ):
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # Without a trailing dot the last label is only emitted by the
        # final (flushing) call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                # Any other handler is rejected with UnicodeError.
                self.assertRaises(UnicodeError,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(UnicodeError,
                                  b"python.org".decode, "idna", errors)
1570
Victor Stinnerf96418d2015-09-21 23:06:27 +02001571
class CodecsModuleTest(unittest.TestCase):
    """Exercise the module-level functions and names of the codecs module."""

    def test_decode(self):
        """codecs.decode(): positional use, keyword use and error cases."""
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        by_keyword = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(by_keyword, '\xe4\xf6\xfc')
        ignoring = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(ignoring, '[]')

    def test_encode(self):
        """codecs.encode(): positional use, keyword use and error cases."""
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        by_keyword = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(by_keyword, b'\xe4\xf6\xfc')
        ignoring = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(ignoring, b'[]')

    def test_register(self):
        """register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """lookup() needs an argument and fails on unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        """getencoder() needs an argument and fails on unknown names."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """getdecoder() needs an argument and fails on unknown names."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """getreader() needs an argument and fails on unknown names."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """getwriter() needs an argument and fails on unknown names."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """Codec lookup still works under a Turkish LC_CTYPE locale."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        """codecs.__all__ matches the expected names and each one resolves."""
        expected_names = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_names, codecs.__all__)
        # Every advertised name must be retrievable from the module.
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        """codecs.open() returns a StreamReaderWriter in every text mode."""
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec fails on any input and any error handler."""
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1679
Victor Stinnerf96418d2015-09-21 23:06:27 +02001680
class StreamReaderTest(unittest.TestCase):
    """Line splitting of a UTF-8 StreamReader over non-ASCII input."""

    def setUp(self):
        # UTF-8 bytes for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits after the newline and keeps it on line one."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Ezio Melottib3aedd42010-11-20 19:04:17 +00001689 self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001690
Victor Stinnerf96418d2015-09-21 23:06:27 +02001691
class EncodedFileTest(unittest.TestCase):
    """Recoding on read and on write through codecs.EncodedFile."""

    def test_basic(self):
        """Read UTF-8 data as UTF-16-LE; write UTF-8 data out as Latin-1."""
        # Reading: the underlying file holds UTF-8, the wrapper hands
        # out the same text as UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes go in, Latin-1 bytes land in the file.
        latin1_sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001703
# Names of the text (str <-> bytes) encodings that the BasicUnicodeTest
# methods below iterate over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are only added when this build of the codecs module
# exposes the corresponding encode function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (the stream and incremental checks skip every name listed here).
broken_unicode_with_stateful = [
    "punycode",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001821
Victor Stinnerf96418d2015-09-21 23:06:27 +02001822
Walter Dörwald3abcb012007-04-16 22:10:50 +00001823class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        """Round-trip a short ASCII string through every listed encoding.

        For each name in all_unicode_encodings this checks, in order:
        the name reported by codecs.lookup(), the stateless
        encoder/decoder pair, and -- unless the encoding is listed in
        broken_unicode_with_stateful -- the stream writer/reader, the
        incremental encoder/decoder, iterencode()/iterdecode(), and the
        incremental codecs created with the "ignore" error handler.
        """
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical name must match the listed name, treating
            # "_" and "-" as equivalent ("latin_1" is special-cased).
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless round trip; the encoder must report that it
            # consumed the whole input.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                # (Queue is a helper defined elsewhere in this file; it is
                # used here as the stream the writer/reader wrap.)
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and collect what the
                # writer produced after each write.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still holds
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # "idna" and "mbcs" are excluded from the "ignore" check.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001900
@support.cpython_only
@unittest.skipIf(_testcapi is None, "need _testcapi module")
def test_basics_capi(self):
    # Round-trip a short ASCII string, one item at a time, through the
    # incremental encoder/decoder objects returned by _testcapi for every
    # encoding that is not listed in broken_unicode_with_stateful.
    # The module-level import of _testcapi falls back to None, so the test
    # is skipped instead of failing with AttributeError when it is missing.
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                # final=True flushes whatever the encoder still buffers
                encodedresult += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001938
def test_seek(self):
    # Reading a stream reader to the end, seeking back to 0 and reading
    # again must return the full text every time.
    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        if encoding == "idna":  # FIXME: See SF bug #1163178
            continue
        if encoding in broken_unicode_with_stateful:
            continue
        reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            data = reader.read()
            self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001953
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all but
    # idna and punycode are also checked with an int argument.
    for encoding in all_unicode_encodings:
        decoder = codecs.getdecoder(encoding)
        self.assertRaises(TypeError, decoder)
        if encoding not in ("idna", "punycode"):
            self.assertRaises(TypeError, decoder, 42)
1960
def test_bad_encode_args(self):
    # Every encoder must reject a call without arguments.
    for encoding in all_unicode_encodings:
        encoder = codecs.getencoder(encoding)
        self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00001965
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001966 def test_encoding_map_type_initialized(self):
1967 from encodings import cp1140
1968 # This used to crash, we are only verifying there's no crash.
1969 table_type = type(cp1140.encoding_table)
1970 self.assertEqual(table_type, table_type)
1971
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
1979
Victor Stinnerf96418d2015-09-21 23:06:27 +02001980
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() with the three supported kinds of mapping:
    # a str indexed by byte value, a dict of int -> str and a dict of
    # int -> int (code point).

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))
        self.assertEqual(
            decode(data, "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 is undefined when the map is too short or maps it to U+FFFE.
        for mapping in ("ab", "ab\ufffe"):
            with self.subTest(mapping=mapping):
                self.assertRaises(UnicodeDecodeError,
                                  decode, data, "strict", mapping)
                self.assertEqual(decode(data, "replace", mapping),
                                 ("ab\ufffd", 3))
                self.assertEqual(decode(data, "backslashreplace", mapping),
                                 ("ab\\x02", 3))
                self.assertEqual(decode(data, "ignore", mapping),
                                 ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        defined = [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]
        for mapping, expected in defined:
            with self.subTest(mapping=mapping):
                self.assertEqual(decode(data, "strict", mapping),
                                 (expected, 3))

        # Byte 2 is undefined: missing key, None, or U+FFFE (Issue #14850).
        undefined = [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ]
        for mapping in undefined:
            with self.subTest(mapping=mapping):
                self.assertRaises(UnicodeDecodeError,
                                  decode, data, "strict", mapping)
                self.assertEqual(decode(data, "replace", mapping),
                                 ("ab\ufffd", 3))
                self.assertEqual(decode(data, "backslashreplace", mapping),
                                 ("ab\\x02", 3))
                self.assertEqual(decode(data, "ignore", mapping),
                                 ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            decode(data, "strict", {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            decode(data, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is a TypeError, not a decode error.
        self.assertRaises(TypeError,
            decode, data, "strict",
            {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        # Byte 2 is undefined: missing key or mapped to 0xFFFE.
        for mapping in ({0: a, 1: b}, {0: a, 1: b, 2: 0xFFFE}):
            with self.subTest(mapping=mapping):
                self.assertRaises(UnicodeDecodeError,
                                  decode, data, "strict", mapping)
                self.assertEqual(decode(data, "replace", mapping),
                                 ("ab\ufffd", 3))
                self.assertEqual(decode(data, "backslashreplace", mapping),
                                 ("ab\\x02", 3))
                self.assertEqual(decode(data, "ignore", mapping),
                                 ("ab", 3))
2215
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002216
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers and close
    # the underlying stream on exit.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same post-condition as test_encodedfile.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002230
Victor Stinnerf96418d2015-09-21 23:06:27 +02002231
class TypesTest(unittest.TestCase):
    # Argument types accepted by the low-level decode functions.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs is only available on Windows builds.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # An escape above U+10FFFF is an error handled by the error handler.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002271
Serhiy Storchakad6793772013-01-29 10:20:44 +02002272
class UnicodeEscapeTest(unittest.TestCase):
    # Behaviour of the unicode_escape encoder and decoder functions.

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised escapes are kept verbatim but must emit a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # c is the escape letter, d the number of truncated lengths to try.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2359
2360
class RawUnicodeEscapeTest(unittest.TestCase):
    # Behaviour of the raw_unicode_escape encoder and decoder functions.

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash passes through unchanged unless followed by u or U.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # c is the escape letter, d the number of truncated lengths to try.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2409
2410
class EscapeEncodeTest(unittest.TestCase):
    # codecs.escape_encode(): bytes in, (escaped bytes, input length) out.

    def test_escape_encode(self):
        tests = [
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        ]
        for data, output in tests:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)
        # Only bytes objects are accepted: str and bytearray are rejected.
        self.assertRaises(TypeError, codecs.escape_encode, 'spam')
        self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2429
2430
class SurrogateEscapeTest(unittest.TestCase):
    # The "surrogateescape" handler maps undecodable bytes to lone
    # surrogates U+DC80..U+DCFF and back again when encoding.

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2463
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002464
class BomTest(unittest.TestCase):
    # BOM handling of files opened with codecs.open() when seek() is used
    # between writes.

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002520
Victor Stinner3fed0872010-05-22 02:16:27 +00002521
# Bytes-to-bytes transform codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Alias names that must resolve to each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# The compression codecs are only registered for testing when the
# underlying module can be imported.
try:
    import zlib
except ImportError:
    # Kept as a module-level name so tests can skip on "zlib is None".
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002551
Victor Stinnerf96418d2015-09-21 23:06:27 +02002552
Georg Brandl02524622010-12-02 18:06:51 +00002553class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002554
def test_basics(self):
    # Encoding then decoding all 256 byte values must round-trip, and both
    # directions must report having consumed their whole input.
    binput = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o))
            self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002565
def test_read(self):
    # A stream reader must decode encoded data back through read().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002573
def test_readline(self):
    # A stream reader must decode encoded data back through readline().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002581
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            data = original
            view = memoryview(data)
            data = codecs.encode(data, encoding)
            view_encoded = codecs.encode(view, encoding)
            self.assertEqual(view_encoded, data)
            # From here on "data" holds the encoded form; re-wrap it.
            view = memoryview(data)
            data = codecs.decode(data, encoding)
            self.assertEqual(data, original)
            view_decoded = codecs.decode(view, encoding)
            self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002599
def test_text_to_binary_blacklists_binary_transforms(self):
    # Check binary -> binary codecs give a good error for str input
    bad_input = "bad input type"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            fmt = (r"{!r} is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg) as failure:
                bad_input.encode(encoding)
            # The LookupError must not be chained to another exception.
            self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002611
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002612 def test_text_to_binary_blacklists_text_transforms(self):
2613 # Check str.encode gives a good error message for str -> str codecs
2614 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002615 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002616 with self.assertRaisesRegex(LookupError, msg):
2617 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002618
def test_binary_to_text_blacklists_binary_transforms(self):
    # Check bytes.decode and bytearray.decode give a good error
    # message for binary -> binary codecs
    data = b"encode first to ensure we meet any format restrictions"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded_data = codecs.encode(data, encoding)
            fmt = (r"{!r} is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                encoded_data.decode(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                bytearray(encoded_data).decode(encoding)
2633
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002634 def test_binary_to_text_blacklists_text_transforms(self):
2635 # Check str -> str codec gives a good error for binary input
2636 for bad_input in (b"immutable", bytearray(b"mutable")):
2637 with self.subTest(bad_input=bad_input):
2638 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002639 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002640 with self.assertRaisesRegex(LookupError, msg) as failure:
2641 bad_input.decode("rot_13")
2642 self.assertIsNone(failure.exception.__cause__)
2643
@unittest.skipUnless(zlib, "Requires zlib support")
def test_custom_zlib_error_is_wrapped(self):
    # Malformed input to the zlib codec must produce an error that names
    # the codec and chains an exception of the same type as its cause.
    pattern = "^decoding with 'zlib_codec' codec failed"
    with self.assertRaisesRegex(Exception, pattern) as failure:
        codecs.decode(b"hello", "zlib_codec")
    wrapper = failure.exception
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2652
2653 def test_custom_hex_error_is_wrapped(self):
2654 # Check hex codec gives a good error for malformed input
2655 msg = "^decoding with 'hex_codec' codec failed"
2656 with self.assertRaisesRegex(Exception, msg) as failure:
2657 codecs.decode(b"hello", "hex_codec")
2658 self.assertIsInstance(failure.exception.__cause__,
2659 type(failure.exception))
2660
2661 # Unfortunately, the bz2 module throws OSError, which the codec
2662 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002663
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias must resolve to the same CodecInfo name as the codec
    # it stands for.
    for canonical_name, alias_names in transform_aliases.items():
        expected_name = codecs.lookup(canonical_name).name
        for alias in alias_names:
            with self.subTest(alias=alias):
                resolved_name = codecs.lookup(alias).name
                self.assertEqual(resolved_name, expected_name)
2672
Martin Panter06171bd2015-09-12 00:34:28 +00002673 def test_quopri_stateless(self):
2674 # Should encode with quotetabs=True
2675 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2676 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2677 # But should still support unescaped tabs and spaces
2678 unescaped = b"space tab eol\n"
2679 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2680
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002681 def test_uu_invalid(self):
2682 # Missing "begin" line
2683 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2684
Nick Coghlan8b097b42013-11-13 23:49:21 +10002685
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# Test codecs live in a module-private registry that is served by a single
# search function, to avoid appearing to leak objects when registering
# multiple search functions.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: return the registered test codec, or None."""
    return _TEST_CODECS.get(codec_name)


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)


try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Fallback used when _codecs has no _forget_codec: do nothing."""
2706
2707
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised by a codec are wrapped with context.

    Each test installs a throwaway codec in _TEST_CODECS under a name that
    is unique to the test instance, then drives it through str.encode(),
    bytes.decode(), codecs.encode() and codecs.decode().
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name.
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # The id() suffix makes the name truly unique, which avoids issues
        # with the codec cache when running these tests multiple times
        # (e.g. when hunting for refleaks).
        raw_name = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # The object to raise is stored on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). The codecs therefore
        # call back in to the instance to find out what to raise, rather
        # than binding it in a closure to an object that may change on the
        # next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode* and *decode* as this test's private codec."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect *exc_type* wrapped with operation and codec details.

        The raised exception must match the wrapped-message pattern and
        must chain an *exc_type* cause that carries a traceback.
        """
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check that *obj_to_raise* is wrapped on all four entry points."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check that *obj_to_raise* surfaces as a RuntimeError matching *msg*."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods.
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately.
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name

        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")

        # Text model methods should complain
        encode_fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                      r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_fmt.format(name)):
            "str_input".encode(name)

        decode_fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                      r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_fmt.format(name)):
            b"bytes input".decode(name)
2867
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002868
Georg Brandl02524622010-12-02 18:06:51 +00002869
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The code page name must appear in the error message.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError; otherwise the decoded text must equal
        *expected* and the consumed count must lie in [0, len(raw)].
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(decoded[0], expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, decoded[0], expected))
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(decoded[1], 0)
            self.assertLessEqual(decoded[1], len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the consumed count must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(encoded[0], expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, encoded[0], expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if is_code_page_present(cp):
                self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                                 ('abc', 3), f'cp{cp}')
            elif support.verbose:
                print(f" skipping cp={cp}")
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # With final=False a trailing incomplete multibyte sequence is
        # left unconsumed instead of being reported as an error.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        #
        # bigmemtest passes the size to the test as an extra positional
        # argument, so the method has to accept it; the input length is
        # derived from it instead of repeating the constant.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Release the input before inspecting the result to lower peak
        # memory use.
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3071
Victor Stinner3a50e702011-10-18 21:21:00 +02003072
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        self.assertRaises(UnicodeEncodeError,
                          text.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3110
3111
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        self.assertRaises(UnicodeEncodeError,
                          text.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3147
3148
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    Results of _testcapi.EncodeLocaleEx() and _testcapi.DecodeLocaleEx()
    are compared with str.encode() and bytes.decode() using the
    filesystem encoding.
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* through _testcapi.EncodeLocaleEx()."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Compare self.encode() with str.encode() for every sample string."""
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    # str.encode() rejects the text, so the C function
                    # must report an encode error as well.
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    self.assertRegex(str(cm.exception),
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    self.assertEqual(self.encode(text, errors), expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: skip when the encoder has no
        # surrogatepass support at all.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                          f"surrogatepass error handler")

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* through _testcapi.DecodeLocaleEx()."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Compare self.decode() with bytes.decode() for the sample inputs."""
        is_utf8 = (self.ENCODING == "utf-8")
        encode_errors = 'surrogateescape' if is_utf8 else 'strict'

        # Start from the raw byte samples and add the encoded form of each
        # text sample that the filesystem encoding can represent.
        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                encoded = None
            else:
                if encoded not in strings:
                    strings.append(encoded)

            if is_utf8:
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    # bytes.decode() rejects the input, so the C function
                    # must report a decode error as well.
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "),
                                    errmsg)
                else:
                    self.assertEqual(self.decode(encoded, errors), expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty input first: skip when the decoder has no
        # surrogatepass support at all.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                          f"surrogatepass error handler")

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')
3261
Victor Stinner3d4226a2018-08-29 22:21:32 +02003262
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()